arhivach-downloader

Download arhivach.vc threads
git clone https://git.ea.contact/arhivach-downloader
Log | Files | Refs | README

commit f0b2401e13487a7382c1bb1e531f1e389afab516
parent c45b7960d9e8527bab3ac0b984e814e1c8f32dca
Author: egor-achkasov <eaachkasov@gmail.com>
Date:   Thu, 26 Feb 2026 15:38:42 +0000

Add export module

Diffstat:
M src/backend.rs | 16 ++++++++--------
M src/bin/cli/main.rs | 4 ++--
D src/export.rs | 146 -------------------------------------------------------------------------------
A src/export/html/mod.rs | 142 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
R src/render.rs -> src/export/html/render.rs | 0
A src/export/mod.rs | 8 ++++++++
M src/lib.rs | 2 +-
7 files changed, 161 insertions(+), 157 deletions(-)

diff --git a/src/backend.rs b/src/backend.rs @@ -1,19 +1,19 @@ use anyhow::{Context, Ok, Result}; use std::result::Result::Ok as StdOk; -use crate::{config::Config, events::{Event, Reporter}, export, http, post::Post}; +use crate::{config::Config, events::{Event, Reporter}, export::{Export, html}, http, post::Post}; -pub fn scrape_thread(url: &str, config: &Config, reporter: &dyn Reporter) -> Result<Post> { +pub fn scrape_thread(url: &str, config: &Config, reporter: &dyn Reporter, exporter: &dyn Export) -> Result<Post> { let t_total = std::time::Instant::now(); reporter.report(Event::FetchStarted { url: url.to_string() }); let t = std::time::Instant::now(); - let html = http::fetch_with_retry(url, 3, reporter)?; + let html_content = http::fetch_with_retry(url, 3, reporter)?; reporter.report(Event::FetchDone { elapsed_ms: t.elapsed().as_millis() }); reporter.report(Event::ParseStarted); let t = std::time::Instant::now(); - let posts = Post::parse_posts(&html).context("failed to parse thread HTML")?; + let posts = Post::parse_posts(&html_content).context("failed to parse thread HTML")?; reporter.report(Event::ParseDone { post_count: posts.len(), elapsed_ms: t.elapsed().as_millis(), @@ -21,7 +21,7 @@ pub fn scrape_thread(url: &str, config: &Config, reporter: &dyn Reporter) -> Res let first_post = posts.first().context("thread has no posts")?.clone(); - export::export2html(&posts, config, reporter).context("failed to export thread")?; + exporter.export(&posts, config, reporter).context("failed to export thread")?; reporter.report(Event::ThreadDone { url: url.to_string(), @@ -31,7 +31,7 @@ pub fn scrape_thread(url: &str, config: &Config, reporter: &dyn Reporter) -> Res Ok(first_post) } -pub fn run(config: &Config, reporter: &dyn Reporter) -> Result<()> { +pub fn run(config: &Config, reporter: &dyn Reporter, exporter: &dyn Export) -> Result<()> { let total = config.urls.len(); let mut first_posts: Vec<Post> = Vec::new(); @@ -42,7 +42,7 @@ pub fn run(config: &Config, 
reporter: &dyn Reporter) -> Result<()> { total, }); - match scrape_thread(url, config, reporter) { + match scrape_thread(url, config, reporter, exporter) { StdOk(first_post) => first_posts.push(first_post), Err(e) => { reporter.report(Event::ThreadFailed { @@ -53,7 +53,7 @@ pub fn run(config: &Config, reporter: &dyn Reporter) -> Result<()> { } } - export::write_index_html(&first_posts, config).context("failed to write main index.html")?; + html::write_index_html(&first_posts, config).context("failed to write main index.html")?; Ok(()) } diff --git a/src/bin/cli/main.rs b/src/bin/cli/main.rs @@ -1,4 +1,4 @@ -use arhivarch_downloader::{backend, events::Event, config::Config}; +use arhivarch_downloader::{backend, events::Event, config::Config, HtmlExporter}; use clap::Parser; use anyhow::Result; @@ -16,7 +16,7 @@ fn main() -> anyhow::Result<()> { let handle = std::thread::spawn({ let config = config.clone(); - move || backend::run(&config, &tx) + move || backend::run(&config, &tx, &HtmlExporter) }); for event in rx { diff --git a/src/export.rs b/src/export.rs @@ -1,146 +0,0 @@ -use crate::{config::Config, events::{Event, Reporter}, http, post::{File, Post}, render}; - -use anyhow::{Result, Context}; - -const TEMPLATE: &str = include_str!("../template.html"); - -/// Write a top-level index.html with one entry per thread (first post + link to thread folder) -pub fn write_index_html(first_posts: &[Post], config: &Config) -> Result<()> { - if first_posts.is_empty() { - return Ok(()); - } - - let posts_html: String = first_posts - .iter() - .map(|p| { - let mut post_html = render::render_post(p, config.files, config.thumb); - // render_post references thumbnails and images in the same directory, - // so replace them with links to the thread folder - config.files.then(|| post_html = post_html.replace( - "<a href=\"files/", - &format!("<a href=\"{}/files/", p.id), - )); - config.thumb.then(|| post_html = post_html.replace( - "<img src=\"thumb/", - &format!("<img 
src=\"{}/thumb/", p.id), - )); - format!("<div><a href=\"{}/index.html\">В тред &rarr;</a></div>{}\n", p.id, post_html) - }) - .collect::<Vec<String>>() - .join("\n"); - - let index_html = TEMPLATE.replace("{{posts}}", &posts_html); - std::fs::write("index.html", index_html) - .context("failed to write index.html")?; - - Ok(()) -} - -/// Export the thread to a simple static HTML -/// -/// Creates a directory as follows: -/// ./{thread_id}, where {thread_id} is OP ID -/// If download_files is true, downloads files to ./{thread_id}/files -/// If download_thumbnails is true, downloads thumbnails to ./{thread_id}/thumb -/// -/// WARNING: If the directory already exists, it will be overwritten -pub fn export2html(posts: &[Post], config: &Config, reporter: &dyn Reporter) -> Result<()> { - if posts.is_empty() { - anyhow::bail!("No posts to export"); - } - - let dir = format!("{}", posts[0].id); - std::fs::create_dir_all(&dir)?; - - let posts_html: String = posts - .iter() - .map(|p| render::render_post(p, config.files, config.thumb)) - .collect::<Vec<String>>() - .join("\n"); - - if config.files { - download_assets( - &posts, - &format!("{}/files", dir), - "files", - |f| &f.url, - config.resume, - reporter, - )?; - } - if config.thumb { - download_assets( - &posts, - &format!("{}/thumb", dir), - "thumbnails", - |f| &f.url_thumb, - config.resume, - reporter, - )?; - } - - let index_html = TEMPLATE.replace("{{posts}}", &posts_html); - std::fs::write(format!("{}/index.html", dir), index_html)?; - - Ok(()) -} - -fn download_assets( - posts: &[Post], - dest_dir: &str, - label: &str, - url_of: impl Fn(&File) -> &str, - skip_if_exists: bool, - reporter: &dyn Reporter, -) -> Result<()> { - std::fs::create_dir_all(dest_dir) - .with_context(|| format!("Failed to create directory {}", dest_dir))?; - - let t = std::time::Instant::now(); - reporter.report(Event::DownloadBatchStarted { - label: label.to_string(), - total_posts: posts.len(), - }); - - for (i, post) in 
posts.iter().enumerate() { - for f in &post.files { - let url = url_of(f); - let filename = url.split('/').last().unwrap_or("").to_string(); - let path = format!("{}/{}", dest_dir, filename); - if skip_if_exists && std::path::Path::new(&path).exists() { - continue; - } - let mut result = Err(anyhow::anyhow!("no attempts")); - for attempt in 1..=3u32 { - result = http::download(url, &path); - if result.is_ok() { break; } - let e = result.as_ref().unwrap_err(); - reporter.report(Event::DownloadAssetFailed { - label: label.to_string(), - filename: filename.clone(), - attempt, - error: e.to_string(), - }); - std::thread::sleep(std::time::Duration::from_secs(3)); - } - if result.is_err() { - reporter.report(Event::DownloadAssetSkipped { - label: label.to_string(), - filename: filename.clone(), - }); - } - } - reporter.report(Event::DownloadBatchProgress { - label: label.to_string(), - done: i + 1, - total: posts.len(), - }); - } - - reporter.report(Event::DownloadBatchDone { - label: label.to_string(), - elapsed_ms: t.elapsed().as_millis(), - }); - - Ok(()) -} diff --git a/src/export/html/mod.rs b/src/export/html/mod.rs @@ -0,0 +1,142 @@ +use crate::{config::Config, events::{Event, Reporter}, http, post::{File, Post}}; +use anyhow::{Result, Context}; +use super::Export; + +mod render; + +const TEMPLATE: &str = include_str!("../../../template.html"); + +pub struct HtmlExporter; + +impl Export for HtmlExporter { + fn export(&self, posts: &[Post], config: &Config, reporter: &dyn Reporter) -> Result<()> { + if posts.is_empty() { + anyhow::bail!("No posts to export"); + } + + let dir = format!("{}", posts[0].id); + std::fs::create_dir_all(&dir)?; + + let posts_html: String = posts + .iter() + .map(|p| render::render_post(p, config.files, config.thumb)) + .collect::<Vec<String>>() + .join("\n"); + + if config.files { + download_assets( + &posts, + &format!("{}/files", dir), + "files", + |f| &f.url, + config.resume, + reporter, + )?; + } + if config.thumb { + download_assets( 
+ &posts, + &format!("{}/thumb", dir), + "thumbnails", + |f| &f.url_thumb, + config.resume, + reporter, + )?; + } + + let index_html = TEMPLATE.replace("{{posts}}", &posts_html); + std::fs::write(format!("{}/index.html", dir), index_html)?; + + Ok(()) + } +} + +/// Write a top-level index.html with one entry per thread (first post + link to thread folder) +pub fn write_index_html(first_posts: &[Post], config: &Config) -> Result<()> { + if first_posts.is_empty() { + return Ok(()); + } + + let posts_html: String = first_posts + .iter() + .map(|p| { + let mut post_html = render::render_post(p, config.files, config.thumb); + config.files.then(|| post_html = post_html.replace( + "<a href=\"files/", + &format!("<a href=\"{}/files/", p.id), + )); + config.thumb.then(|| post_html = post_html.replace( + "<img src=\"thumb/", + &format!("<img src=\"{}/thumb/", p.id), + )); + format!("<div><a href=\"{}/index.html\">В тред &rarr;</a></div>{}\n", p.id, post_html) + }) + .collect::<Vec<String>>() + .join("\n"); + + let index_html = TEMPLATE.replace("{{posts}}", &posts_html); + std::fs::write("index.html", index_html) + .context("failed to write index.html")?; + + Ok(()) +} + +fn download_assets( + posts: &[Post], + dest_dir: &str, + label: &str, + url_of: impl Fn(&File) -> &str, + skip_if_exists: bool, + reporter: &dyn Reporter, +) -> Result<()> { + std::fs::create_dir_all(dest_dir) + .with_context(|| format!("Failed to create directory {}", dest_dir))?; + + let t = std::time::Instant::now(); + reporter.report(Event::DownloadBatchStarted { + label: label.to_string(), + total_posts: posts.len(), + }); + + for (i, post) in posts.iter().enumerate() { + for f in &post.files { + let url = url_of(f); + let filename = url.split('/').last().unwrap_or("").to_string(); + let path = format!("{}/{}", dest_dir, filename); + if skip_if_exists && std::path::Path::new(&path).exists() { + continue; + } + let mut result = Err(anyhow::anyhow!("no attempts")); + for attempt in 1..=3u32 { + result = 
http::download(url, &path); + if result.is_ok() { break; } + let e = result.as_ref().unwrap_err(); + reporter.report(Event::DownloadAssetFailed { + label: label.to_string(), + filename: filename.clone(), + attempt, + error: e.to_string(), + }); + std::thread::sleep(std::time::Duration::from_secs(3)); + } + if result.is_err() { + reporter.report(Event::DownloadAssetSkipped { + label: label.to_string(), + filename: filename.clone(), + }); + } + } + reporter.report(Event::DownloadBatchProgress { + label: label.to_string(), + done: i + 1, + total: posts.len(), + }); + } + + reporter.report(Event::DownloadBatchDone { + label: label.to_string(), + elapsed_ms: t.elapsed().as_millis(), + }); + + Ok(()) +} diff --git a/src/render.rs b/src/export/html/render.rs diff --git a/src/export/mod.rs b/src/export/mod.rs @@ -0,0 +1,8 @@ +use crate::{config::Config, events::Reporter, post::Post}; +use anyhow::Result; + +pub mod html; + +pub trait Export { + fn export(&self, posts: &[Post], config: &Config, reporter: &dyn Reporter) -> Result<()>; +} diff --git a/src/lib.rs b/src/lib.rs @@ -3,7 +3,7 @@ pub mod events; pub mod backend; pub mod post; pub mod http; -pub mod render; pub mod export; pub use events::{Reporter, NullReporter}; +pub use export::html::HtmlExporter;