arhivach-downloader

Download arhivach.vc threads
git clone https://git.ea.contact/arhivach-downloader
Log | Files | Refs | README

commit 90a47f47d40d2af5be72a86f068660e6eced45da
parent 0458300db0e2dadd557187538424b7cde8c42082
Author: egor-achkasov <eaachkasov@gmail.com>
Date:   Sun, 22 Feb 2026 19:36:47 +0000

Improve info and error output

Diffstat:
M Cargo.lock | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M Cargo.toml | 1 +
M src/export.rs | 75 +++++++++++++++++++++++++++++++++++++++++++++++++--------------------------
M src/main.rs | 35 +++++++++++++++++++++++++++++------
M src/post.rs | 2 ++
5 files changed, 160 insertions(+), 32 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock @@ -78,6 +78,7 @@ dependencies = [ "scraper", "tokio", "tracing", + "tracing-subscriber", ] [[package]] @@ -733,6 +734,12 @@ dependencies = [ ] [[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + +[[package]] name = "libc" version = "0.2.180" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -783,6 +790,15 @@ dependencies = [ ] [[package]] +name = "matchers" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9" +dependencies = [ + "regex-automata", +] + +[[package]] name = "memchr" version = "2.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -829,6 +845,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" [[package]] +name = "nu-ansi-term" +version = "0.50.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] name = "once_cell" version = "1.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1333,6 +1358,15 @@ dependencies = [ ] [[package]] +name = "sharded-slab" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" +dependencies = [ + "lazy_static", +] + +[[package]] name = "shlex" version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1485,6 +1519,15 @@ dependencies = [ ] [[package]] +name = "thread_local" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185" +dependencies = [ + "cfg-if", +] + +[[package]] name = "tinystr" version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1627,6 +1670,36 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" dependencies = [ "once_cell", + "valuable", +] + +[[package]] +name = "tracing-log" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" +dependencies = [ + "log", + "once_cell", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.3.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f30143827ddab0d256fd843b7a66d164e9f271cfa0dde49142c5ca0ca291f1e" +dependencies = [ + "matchers", + "nu-ansi-term", + "once_cell", + "regex-automata", + "sharded-slab", + "smallvec", + "thread_local", + "tracing", + "tracing-core", + "tracing-log", ] [[package]] @@ -1684,6 +1757,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] +name = "valuable" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" + +[[package]] name = "vcpkg" version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" diff --git a/Cargo.toml b/Cargo.toml @@ -12,3 +12,4 @@ reqwest = { version = "0.12", features = ["blocking"] } scraper = "0.25.0" tokio = { version = "1.49.0", features = ["macros", "rt", "rt-multi-thread"] } tracing = "0.1.44" +tracing-subscriber = { version = "0.3", features = ["env-filter"] } diff --git a/src/export.rs b/src/export.rs @@ -1,7 +1,7 @@ use crate::post::Post; -use crate::file::File; use anyhow::{Result, 
Context}; +use tracing::debug; fn html_escape(s: &str) -> String { s.replace('&', "&amp;") @@ -54,47 +54,27 @@ pub async fn export2html( anyhow::bail!("No posts to export"); } - // Create directories let dir = format!("{}", posts[0].id); std::fs::create_dir_all(&dir)?; - // Render the thread let posts_html: String = posts .iter() .map(|p| render_post(p, download_files, download_thumbnails)) .collect::<Vec<String>>() .join("\n"); - // Download files - async fn download_helper( - base_dir: &str, - subdir: &str, - posts: &[Post], - get_url: fn(&File) -> &str, - ) -> Result<()>{ - let dir = format!("{}/{}", base_dir, subdir); - std::fs::create_dir_all(&dir) - .with_context(|| format!("Failed to create directory {}", dir))?; - for (f, filename) in posts.iter().flat_map(|p| &p.files) - .filter_map(|f| f.url.split('/').last().map(|name| (f, name))) - { - let path = format!("{}/{}", dir, filename); - download(get_url(f), &path).await - .with_context(|| format!("Failed to download file {}", path))?; - } - Ok(()) - } + if download_files { - download_helper(&dir, "files", &posts, |f| &f.url).await?; + download_assets(&posts, &format!("{}/files", dir), "files", |f| &f.url).await?; } + if download_thumbnails { - download_helper(&dir, "thumb", &posts, |f| &f.url_thumb).await?; + download_assets(&posts, &format!("{}/thumb", dir), "thumbnails", |f| &f.url_thumb).await?; } - // Insert the posts html into a template and write as index.html let template = std::fs::read_to_string("template.html")? .replace("{{posts}}", &posts_html); std::fs::write(format!("{}/index.html", dir), template)?; - + Ok(()) } @@ -192,6 +172,49 @@ fn render_images( } +async fn download_assets( + posts: &[Post], + dest_dir: &str, + label: &str, + url_of: impl Fn(&crate::file::File) -> &str, +) -> Result<()> { + use std::io::Write; + + std::fs::create_dir_all(dest_dir) + .with_context(|| format!("Failed to create directory {}", dest_dir))?; + let t = std::time::Instant::now(); + print!("\tDownloading {}... 
post 0 / {}", label, posts.len()); + std::io::stdout().flush().ok(); + for (i, post) in posts.iter().enumerate() { + for f in &post.files { + let url = url_of(f); + let filename = url.split('/').last().unwrap_or(""); + let path = format!("{}/{}", dest_dir, filename); + debug!(url = %url, %path, "Downloading {}", label); + let mut failed = false; + for attempt in 0..3 { + match download(url, &path).await { + Ok(()) => { failed = false; break; } + Err(e) => { + failed = true; + println!("\r\tFailed to download {} {}: {}\n\t-> Waiting 3 seconds...", label, filename, e); + if attempt < 2 { + tokio::time::sleep(std::time::Duration::from_secs(3)).await; + } + } + } + } + if failed { + println!("\tSkipping {} {} after 3 failed attempts.", label, filename); + } + } + print!("\r\tDownloading {}... post {} / {}", label, i + 1, posts.len()); + std::io::stdout().flush().ok(); + } + println!(" Done ({} ms)", t.elapsed().as_millis()); + Ok(()) +} + async fn download(url: &str, path: &str) -> Result<()> { let bytes = reqwest::get(url).await .with_context(|| format!("HTTP GET failed for {}", url))? diff --git a/src/main.rs b/src/main.rs @@ -9,32 +9,55 @@ use post::Post; use anyhow::{Context, Ok, Result}; async fn scrape_thread(url: &str, config: &Config) -> Result<()> { + use std::io::Write; + let t_total = std::time::Instant::now(); + + print!("\tGetting thread..."); + std::io::stdout().flush().ok(); + let t = std::time::Instant::now(); let html = reqwest::get(url).await .with_context(|| format!("HTTP GET failed for {url}"))? 
.text().await .context("failed to read response body")?; + println!(" Done ({} ms)", t.elapsed().as_millis()); + + print!("\tParsing posts..."); + std::io::stdout().flush().ok(); + let t = std::time::Instant::now(); let posts = Post::parse_posts(&html) .context("failed to parse thread HTML")?; + println!(" Done ({} ms)", t.elapsed().as_millis()); + export::export2html(posts, config.files, config.thumb).await .context("failed to export thread")?; + + println!("Done processing {} ({} ms)", url, t_total.elapsed().as_millis()); Ok(()) } #[tokio::main] -async fn main() -> Result<()>{ +async fn main() -> Result<()> { + tracing_subscriber::fmt() + .without_time() + .with_target(false) + .with_env_filter( + tracing_subscriber::EnvFilter::try_from_default_env() + .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("warn")) + ) + .init(); + let config = parse_args() .unwrap_or_else(|e| { - eprintln!("Error parsing arguments: {}", e); + eprintln!("Error: {}", e); std::process::exit(1); }); - for (i, url) in config.urls.iter().enumerate() { - println!("Processing: {} ({} / {})", url, i + 1, config.urls.len()); + for url in &config.urls { + println!("Processing {}:", url); scrape_thread(url, &config).await - .unwrap_or_else(|e| eprintln!("Error processing {}: {}", url, e)); + .unwrap_or_else(|e| eprintln!("Error processing {}: {:#}", url, e)); } - println!("Done"); Ok(()) } diff --git a/src/post.rs b/src/post.rs @@ -1,6 +1,7 @@ use crate::file::File; use anyhow::{Context, Result}; +use tracing::debug; /// Represents a single post in a thread #[derive(Debug, Clone)] @@ -35,6 +36,7 @@ impl Post { posts.push(post); } + debug!("Parsed {} posts", posts.len()); Ok(posts) }