arhivach-downloader

Download arhivach.vc threads
git clone https://git.ea.contact/arhivach-downloader
Log | Files | Refs | README

commit 70a26e088ac34b471c3b4705d52f6793dbb138c4
parent 552a60776429c2f61e0cf188d48334e445176d27
Author: egor-achkasov <eaachkasov@gmail.com>
Date:   Wed, 25 Feb 2026 16:00:24 +0000

Refactor towards lib-first arch

Diffstat:
Asrc/backend.rs | 81+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/cli.rs | 54++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/config.rs | 7+++++++
Asrc/events.rs | 23+++++++++++++++++++++++
Msrc/export.rs | 51+++++++++++++++++++++++++++++++++++++--------------
Asrc/lib.rs | 6++++++
Msrc/main.rs | 127++++++++++++++++++++++++++++++++++++++++---------------------------------------
Dsrc/parse_args.rs | 59-----------------------------------------------------------
8 files changed, 273 insertions(+), 135 deletions(-)

diff --git a/src/backend.rs b/src/backend.rs @@ -0,0 +1,81 @@ +use std::sync::mpsc::Sender; + +use anyhow::{Context, Ok, Result}; +use std::result::Result::Ok as StdOk; + +use crate::{config::Config, events::Event, export, post::Post}; + +pub fn fetch_with_retry(url: &str, attempts: u32, tx: &Sender<Event>) -> Result<String> { + for attempt in 1..=attempts { + match reqwest::blocking::get(url).and_then(|r| r.text()) { + StdOk(text) => return Ok(text), + Err(e) => { + tx.send(Event::FetchRetrying { + url: url.to_string(), + attempt, + max_attempts: attempts, + error: e.to_string(), + }).ok(); + if attempt < attempts { + std::thread::sleep(std::time::Duration::from_secs(3)); + } + } + } + } + anyhow::bail!("failed to get thread after {attempts} attempts") +} + +pub fn scrape_thread(url: &str, config: &Config, tx: &Sender<Event>) -> Result<Post> { + let t_total = std::time::Instant::now(); + + tx.send(Event::FetchStarted { url: url.to_string() }).ok(); + let t = std::time::Instant::now(); + let html = fetch_with_retry(url, 3, tx)?; + tx.send(Event::FetchDone { elapsed_ms: t.elapsed().as_millis() }).ok(); + + tx.send(Event::ParseStarted).ok(); + let t = std::time::Instant::now(); + let posts = Post::parse_posts(&html).context("failed to parse thread HTML")?; + tx.send(Event::ParseDone { + post_count: posts.len(), + elapsed_ms: t.elapsed().as_millis(), + }).ok(); + + let first_post = posts.first().context("thread has no posts")?.clone(); + + export::export2html(&posts, config, tx).context("failed to export thread")?; + + tx.send(Event::ThreadDone { + url: url.to_string(), + elapsed_ms: t_total.elapsed().as_millis(), + }).ok(); + + Ok(first_post) +} + +pub fn run(config: &Config, tx: Sender<Event>) -> Result<()> { + let total = config.urls.len(); + let mut first_posts: Vec<Post> = Vec::new(); + + for (i, url) in config.urls.iter().enumerate() { + tx.send(Event::ThreadStarted { + url: url.clone(), + index: i + 1, + total, + }).ok(); + + match scrape_thread(url, config, 
&tx) { + StdOk(first_post) => first_posts.push(first_post), + Err(e) => { + tx.send(Event::ThreadFailed { + url: url.clone(), + error: format!("{:#}", e), + }).ok(); + } + } + } + + export::write_index_html(&first_posts, config).context("failed to write main index.html")?; + + Ok(()) +} diff --git a/src/cli.rs b/src/cli.rs @@ -0,0 +1,54 @@ +use clap::Parser; +use anyhow::Result; + +use std::path::PathBuf; + +use arhivarch_downloader::config::Config; + +pub fn parse_args() -> Result<Config> { + #[derive(Parser)] + #[command(about, long_about)] + struct Cli { + /// URL to download + url: Option<String>, + + /// Path to a text file containing a list of URLs (one per line) + #[arg(short = 'l', long = "list")] + list: Option<PathBuf>, + + /// Download thumbnail images, default: false + #[arg(short = 't', long = "thumb", default_value_t = false)] + thumb: bool, + + /// Download files (images, videos, gifs, etc), default: false + #[arg(short = 'f', long = "files", default_value_t = false)] + files: bool, + + /// Resume files and thumbnails downloading instead of overwriting. 
Useless if neither -t nor -f are set, default: false + #[arg(short = 'r', long = "resume", default_value_t = false)] + resume: bool + } + let cli = Cli::parse(); + + let mut urls = Vec::new(); + // [URL] + if let Some(url) = cli.url { + urls.push(url); + } + // [List] + if let Some(list) = cli.list { + for line in std::fs::read_to_string(list)?.lines() { + urls.push(line.to_string()); + } + } + if urls.is_empty() { + anyhow::bail!("No URLs provided"); + } + + Ok(Config { + urls, + thumb: cli.thumb, + files: cli.files, + resume: cli.resume, + }) +} diff --git a/src/config.rs b/src/config.rs @@ -0,0 +1,7 @@ +#[derive(Debug, Clone)] +pub struct Config { + pub urls: Vec<String>, + pub thumb: bool, + pub files: bool, + pub resume: bool, +} diff --git a/src/events.rs b/src/events.rs @@ -0,0 +1,23 @@ +#[derive(Debug, Clone)] +pub enum Event { + // Thread-level lifecycle + ThreadStarted { url: String, index: usize, total: usize }, + ThreadDone { url: String, elapsed_ms: u128 }, + ThreadFailed { url: String, error: String }, + + // HTTP fetch + FetchStarted { url: String }, + FetchDone { elapsed_ms: u128 }, + FetchRetrying { url: String, attempt: u32, max_attempts: u32, error: String }, + + // HTML parsing + ParseStarted, + ParseDone { post_count: usize, elapsed_ms: u128 }, + + // Asset downloading + DownloadBatchStarted { label: String, total_posts: usize }, + DownloadBatchProgress { label: String, done: usize, total: usize }, + DownloadAssetFailed { label: String, filename: String, attempt: u32, error: String }, + DownloadAssetSkipped { label: String, filename: String }, + DownloadBatchDone { label: String, elapsed_ms: u128 }, +} diff --git a/src/export.rs b/src/export.rs @@ -1,4 +1,6 @@ -use crate::{parse_args::Config, post::Post}; +use std::sync::mpsc::Sender; + +use crate::{config::Config, events::Event, post::Post}; use anyhow::{Result, Context}; @@ -39,7 +41,7 @@ fn render_text_to_html(text: &str) -> String { } /// Write a top-level index.html with one entry per 
thread (first post + link to thread folder) -pub fn write_index_html(first_posts: &[Post], config: &Config) -> Result<()> { +pub(crate) fn write_index_html(first_posts: &[Post], config: &Config) -> Result<()> { if first_posts.is_empty() { return Ok(()); } @@ -78,7 +80,7 @@ pub fn write_index_html(first_posts: &[Post], config: &Config) -> Result<()> { /// If download_thumbnails is true, downloads thumbnails to ./{thread_id}/thumb /// /// WARNING: If the directory already exists, it will be overwritten -pub fn export2html(posts: &[Post], config: &Config) -> Result<()> { +pub(crate) fn export2html(posts: &[Post], config: &Config, tx: &Sender<Event>) -> Result<()> { if posts.is_empty() { anyhow::bail!("No posts to export"); } @@ -99,6 +101,7 @@ pub fn export2html(posts: &[Post], config: &Config) -> Result<()> { "files", |f| &f.url, config.resume, + tx, )?; } if config.thumb { @@ -108,6 +111,7 @@ pub fn export2html(posts: &[Post], config: &Config) -> Result<()> { "thumbnails", |f| &f.url_thumb, config.resume, + tx, )?; } @@ -213,38 +217,57 @@ fn download_assets( label: &str, url_of: impl Fn(&crate::file::File) -> &str, skip_if_exists: bool, + tx: &Sender<Event>, ) -> Result<()> { - use std::io::Write; - std::fs::create_dir_all(dest_dir) .with_context(|| format!("Failed to create directory {}", dest_dir))?; + let t = std::time::Instant::now(); - print!("\tDownloading {}... 
post 0 / {}", label, posts.len()); - std::io::stdout().flush().ok(); + tx.send(Event::DownloadBatchStarted { + label: label.to_string(), + total_posts: posts.len(), + }).ok(); + for (i, post) in posts.iter().enumerate() { for f in &post.files { let url = url_of(f); - let filename = url.split('/').last().unwrap_or(""); + let filename = url.split('/').last().unwrap_or("").to_string(); let path = format!("{}/{}", dest_dir, filename); if skip_if_exists && std::path::Path::new(&path).exists() { continue; } let mut result = Err(anyhow::anyhow!("no attempts")); - for _ in 0..3 { + for attempt in 1..=3u32 { result = download(url, &path); if result.is_ok() { break; } let e = result.as_ref().unwrap_err(); - println!("\r\tFailed to download {} {}: {}\n\t-> Waiting 3 seconds...", label, filename, e); + tx.send(Event::DownloadAssetFailed { + label: label.to_string(), + filename: filename.clone(), + attempt, + error: e.to_string(), + }).ok(); std::thread::sleep(std::time::Duration::from_secs(3)); } if result.is_err() { - println!("\tSkipping {} {} after 3 failed attempts.", label, filename); + tx.send(Event::DownloadAssetSkipped { + label: label.to_string(), + filename: filename.clone(), + }).ok(); } } - print!("\r\tDownloading {}... 
post {} / {}", label, i + 1, posts.len()); - std::io::stdout().flush().ok(); + tx.send(Event::DownloadBatchProgress { + label: label.to_string(), + done: i + 1, + total: posts.len(), + }).ok(); } - println!(" Done ({} ms)", t.elapsed().as_millis()); + + tx.send(Event::DownloadBatchDone { + label: label.to_string(), + elapsed_ms: t.elapsed().as_millis(), + }).ok(); + Ok(()) } diff --git a/src/lib.rs b/src/lib.rs @@ -0,0 +1,6 @@ +pub mod config; +pub mod events; +pub mod backend; +pub mod post; +pub mod file; +pub(crate) mod export; diff --git a/src/main.rs b/src/main.rs @@ -1,77 +1,80 @@ -mod parse_args; -mod post; -mod file; -mod export; - -use parse_args::{Config, parse_args}; -use post::Post; - -use anyhow::{Context, Ok, Result}; -use std::result::Result::Ok as StdOk; - -fn fetch_with_retry(url: &str, attempts: u32) -> Result<String> { - for attempt in 1..=attempts { - match reqwest::blocking::get(url).and_then(|r| r.text()) { - StdOk(text) => return Ok(text), - Err(e) => { - eprintln!("\n\tHTTP request failed for {url}: {e}"); - if attempt < attempts { - eprintln!("\tWaiting 3 seconds..."); - std::thread::sleep(std::time::Duration::from_secs(3)); - } - } - } +mod cli; + +use arhivarch_downloader::{backend, events::Event}; +use std::sync::mpsc; + +fn main() -> anyhow::Result<()> { + let config = cli::parse_args().unwrap_or_else(|e| { + eprintln!("Error: {}", e); + std::process::exit(1); + }); + + let (tx, rx) = mpsc::channel::<Event>(); + + let handle = std::thread::spawn({ + let config = config.clone(); + move || backend::run(&config, tx) + }); + + for event in rx { + render_event(&event); } - anyhow::bail!("failed to get thread after {attempts} attempts") + + handle.join().unwrap() } -fn scrape_thread(url: &str, config: &Config) -> Result<Post> { +fn render_event(event: &Event) { use std::io::Write; - let t_total = std::time::Instant::now(); + match event { + Event::ThreadStarted { url, index, total } => + println!("Processing {} ({} / {}):", url, index, 
total), - print!("\tGetting thread..."); - std::io::stdout().flush().ok(); - let t = std::time::Instant::now(); - let html = fetch_with_retry(url, 3)?; - println!(" Done ({} ms)", t.elapsed().as_millis()); + Event::ThreadDone { url, elapsed_ms } => + println!("Done processing {} ({} ms)", url, elapsed_ms), - print!("\tParsing posts..."); - std::io::stdout().flush().ok(); - let t = std::time::Instant::now(); - let posts = Post::parse_posts(&html) - .context("failed to parse thread HTML")?; - println!(" Done ({} ms)", t.elapsed().as_millis()); + Event::ThreadFailed { url, error } => + eprintln!("Error processing {}: {}", url, error), - let first_post = posts.first().context("thread has no posts")?.clone(); + Event::FetchStarted { .. } => { + print!("\tGetting thread..."); + std::io::stdout().flush().ok(); + } - export::export2html(&posts, &config) - .context("failed to export thread")?; + Event::FetchDone { elapsed_ms } => + println!(" Done ({} ms)", elapsed_ms), - println!("Done processing {} ({} ms)", url, t_total.elapsed().as_millis()); - Ok(first_post) -} + Event::FetchRetrying { url, attempt, max_attempts, error } => { + eprintln!("\n\tHTTP request failed for {}: {}", url, error); + if attempt < max_attempts { + eprintln!("\tWaiting 3 seconds..."); + } + } + Event::ParseStarted => { + print!("\tParsing posts..."); + std::io::stdout().flush().ok(); + } -fn main() -> Result<()> { - let config = parse_args() - .unwrap_or_else(|e| { - eprintln!("Error: {}", e); - std::process::exit(1); - }); - - let mut first_posts: Vec<Post> = Vec::new(); - let mut i = 1; - for url in &config.urls { - println!("Processing {} ({} / {}):", url, i, config.urls.len()); - i += 1; - match scrape_thread(url, &config) { - StdOk(first_post) => first_posts.push(first_post), - Err(e) => eprintln!("Error processing {}: {:#}", url, e), + Event::ParseDone { elapsed_ms, .. 
} => + println!(" Done ({} ms)", elapsed_ms), + + Event::DownloadBatchStarted { label, total_posts } => { + print!("\tDownloading {}... post 0 / {}", label, total_posts); + std::io::stdout().flush().ok(); } - } - export::write_index_html(&first_posts, &config) - .context("failed to write main index.html")?; + Event::DownloadBatchProgress { label, done, total } => { + print!("\r\tDownloading {}... post {} / {}", label, done, total); + std::io::stdout().flush().ok(); + } + + Event::DownloadAssetFailed { label, filename, error, .. } => + println!("\r\tFailed to download {} {}: {}\n\t-> Waiting 3 seconds...", label, filename, error), - Ok(()) + Event::DownloadAssetSkipped { label, filename } => + println!("\tSkipping {} {} after 3 failed attempts.", label, filename), + + Event::DownloadBatchDone { elapsed_ms, .. } => + println!(" Done ({} ms)", elapsed_ms), + } } diff --git a/src/parse_args.rs b/src/parse_args.rs @@ -1,59 +0,0 @@ -use clap::Parser; -use anyhow::Result; - -use std::path::PathBuf; - -pub struct Config{ - pub urls: Vec<String>, - pub thumb: bool, - pub files: bool, - pub resume: bool, -} - -pub fn parse_args() -> Result<Config> { - #[derive(Parser)] - #[command(about, long_about)] - struct Cli { - /// URL to download - url: Option<String>, - - /// Path to a text file containing a list of URLs (one per line) - #[arg(short = 'l', long = "list")] - list: Option<PathBuf>, - - /// Download thumbnail images, default: false - #[arg(short = 't', long = "thumb", default_value_t = false)] - thumb: bool, - - /// Download files (images, videos, gifs, etc), default: false - #[arg(short = 'f', long = "files", default_value_t = false)] - files: bool, - - /// Resume files and thumbnails downloading instead of overwriting. 
Useless if neither -t nor -f are set, default: false - #[arg(short = 'r', long = "resume", default_value_t = false)] - resume: bool - } - let cli = Cli::parse(); - - let mut urls = Vec::new(); - // [URL] - if let Some(url) = cli.url { - urls.push(url); - } - // [List] - if let Some(list) = cli.list { - for line in std::fs::read_to_string(list)?.lines() { - urls.push(line.to_string()); - } - } - if urls.is_empty() { - anyhow::bail!("No URLs provided"); - } - - Ok(Config { - urls, - thumb: cli.thumb, - files: cli.files, - resume: cli.resume, - }) -}