arhivach-downloader

Download arhivach.vc threads
git clone https://git.ea.contact/arhivach-downloader
Log | Files | Refs | README

commit 0458300db0e2dadd557187538424b7cde8c42082
parent d3652bbd77e194b6a8194e4b9e635d77cb0f9205
Author: egor-achkasov <eaachkasov@gmail.com>
Date:   Sun, 22 Feb 2026 00:03:28 +0000

Use anyhow for results

Diffstat:
M Cargo.lock | 20 ++++++++++++++++++++
M Cargo.toml | 2 ++
M src/export.rs | 46 ++++++++++++++++++++++++++++------------------
M src/main.rs | 28 +++++++++++++---------------
M src/parse_args.rs | 5 +++--
M src/post.rs | 23 ++++++++++++-----------
6 files changed, 78 insertions(+), 46 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock @@ -62,14 +62,22 @@ dependencies = [ ] [[package]] +name = "anyhow" +version = "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" + +[[package]] name = "archivarch-downloader" version = "0.1.0" dependencies = [ + "anyhow", "clap", "regex", "reqwest", "scraper", "tokio", + "tracing", ] [[package]] @@ -1597,10 +1605,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" dependencies = [ "pin-project-lite", + "tracing-attributes", "tracing-core", ] [[package]] +name = "tracing-attributes" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] name = "tracing-core" version = "0.1.36" source = "registry+https://github.com/rust-lang/crates.io-index" diff --git a/Cargo.toml b/Cargo.toml @@ -5,8 +5,10 @@ version = "0.1.0" edition = "2024" [dependencies] +anyhow = "1.0.102" clap = { version = "4.5.57", features = ["derive"] } regex = "1.12.3" reqwest = { version = "0.12", features = ["blocking"] } scraper = "0.25.0" tokio = { version = "1.49.0", features = ["macros", "rt", "rt-multi-thread"] } +tracing = "0.1.44" diff --git a/src/export.rs b/src/export.rs @@ -1,4 +1,7 @@ use crate::post::Post; +use crate::file::File; + +use anyhow::{Result, Context}; fn html_escape(s: &str) -> String { s.replace('&', "&amp;") @@ -46,9 +49,9 @@ pub async fn export2html( posts: Vec<Post>, download_files: bool, download_thumbnails: bool, -) -> Result<(), Box<dyn std::error::Error>> { +) -> Result<()> { if posts.is_empty() { - return Err("No posts to export".into()); + anyhow::bail!("No posts to export"); } // Create directories @@ -62,26 +65,29 @@ pub async fn export2html( 
.collect::<Vec<String>>() .join("\n"); // Download files - if download_files { - let dir = format!("{}/files", dir); - std::fs::create_dir_all(&dir)?; + async fn download_helper( + base_dir: &str, + subdir: &str, + posts: &[Post], + get_url: fn(&File) -> &str, + ) -> Result<()>{ + let dir = format!("{}/{}", base_dir, subdir); + std::fs::create_dir_all(&dir) + .with_context(|| format!("Failed to create directory {}", dir))?; for (f, filename) in posts.iter().flat_map(|p| &p.files) .filter_map(|f| f.url.split('/').last().map(|name| (f, name))) { let path = format!("{}/{}", dir, filename); - download(&f.url, &path).await?; + download(get_url(f), &path).await + .with_context(|| format!("Failed to download file {}", path))?; } + Ok(()) + } + if download_files { + download_helper(&dir, "files", &posts, |f| &f.url).await?; } - // Download thumbnails if download_thumbnails { - let dir = format!("{}/thumb", dir); - std::fs::create_dir_all(&dir)?; - for (f, filename) in posts.iter().flat_map(|p| &p.files) - .filter_map(|f| f.url_thumb.split('/').last().map(|name| (f, name))) - { - let path = format!("{}/{}", dir, filename); - download(&f.url_thumb, &path).await?; - } + download_helper(&dir, "thumb", &posts, |f| &f.url_thumb).await?; } // Insert the posts html into a template and write as index.html @@ -186,8 +192,12 @@ fn render_images( } -async fn download(url: &str, path: &str) -> Result<(), Box<dyn std::error::Error>> { - let bytes = reqwest::get(url).await?.bytes().await?; - std::fs::write(path, &bytes)?; +async fn download(url: &str, path: &str) -> Result<()> { + let bytes = reqwest::get(url).await + .with_context(|| format!("HTTP GET failed for {}", url))? 
+ .bytes().await + .context("failed to read response body")?; + std::fs::write(path, &bytes) + .with_context(|| format!("failed to write {}", path))?; Ok(()) } diff --git a/src/main.rs b/src/main.rs @@ -6,26 +6,23 @@ mod export; use parse_args::{Config, parse_args}; use post::Post; -async fn scrape_thread(url: &str, config: &Config) -> Result<(), Box<dyn std::error::Error>> { - // Validate URL (expect https?://arhivach\.vc/thread/\d{7}/?) - let is_valid = matches!( - url.trim().trim_end_matches('/').split('/').collect::<Vec<_>>().as_slice(), - ["https:" | "http:", "", "arhivach.vc", "thread", _] - ); - if !is_valid { - return Err("invalid URL".into()); - } - - let html = reqwest::get(url).await?.text().await?; - let posts = Post::parse_posts(&html)?; - export::export2html(posts, config.files, config.thumb).await?; - +use anyhow::{Context, Ok, Result}; + +async fn scrape_thread(url: &str, config: &Config) -> Result<()> { + let html = reqwest::get(url).await + .with_context(|| format!("HTTP GET failed for {url}"))? 
+ .text().await + .context("failed to read response body")?; + let posts = Post::parse_posts(&html) + .context("failed to parse thread HTML")?; + export::export2html(posts, config.files, config.thumb).await + .context("failed to export thread")?; Ok(()) } #[tokio::main] -async fn main() { +async fn main() -> Result<()>{ let config = parse_args() .unwrap_or_else(|e| { eprintln!("Error parsing arguments: {}", e); @@ -39,4 +36,5 @@ async fn main() { } println!("Done"); + Ok(()) } diff --git a/src/parse_args.rs b/src/parse_args.rs @@ -1,4 +1,5 @@ use clap::Parser; +use anyhow::Result; use std::path::PathBuf; @@ -8,7 +9,7 @@ pub struct Config{ pub files: bool } -pub fn parse_args() -> Result<Config, Box<dyn std::error::Error>> { +pub fn parse_args() -> Result<Config> { #[derive(Parser)] #[command(about, long_about)] struct Cli { @@ -41,7 +42,7 @@ pub fn parse_args() -> Result<Config, Box<dyn std::error::Error>> { } } if urls.is_empty() { - return Err("No URLs provided".into()); + anyhow::bail!("No URLs provided"); } Ok(Config { diff --git a/src/post.rs b/src/post.rs @@ -1,5 +1,7 @@ use crate::file::File; +use anyhow::{Context, Result}; + /// Represents a single post in a thread #[derive(Debug, Clone)] pub struct Post { @@ -23,7 +25,7 @@ pub struct Post { impl Post { pub fn parse_posts( html: &str, - ) -> Result<Vec<Post>, Box<dyn std::error::Error>> { + ) -> Result<Vec<Post>> { let mut posts = Vec::new(); let document = scraper::Html::parse_document(html); @@ -45,7 +47,7 @@ impl Post { /// <span class="post_comment">...</span> (see parse_post_comment function) /// </div> /// ``` - fn parse_post(node: scraper::ElementRef) -> Result<Post, Box<dyn std::error::Error>> { + fn parse_post(node: scraper::ElementRef) -> Result<Post> { static SEL_POST_HEAD: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new( || scraper::Selector::parse("div.post_head").unwrap() ); @@ -56,13 +58,13 @@ impl Post { let post_head = node .select(&SEL_POST_HEAD) .next() - .ok_or("missing 
post_head")?; + .context("missing post_head")?; let (subject, name, mailto, time, num, id) = Post::parse_post_head(post_head)?; let post_comment = node .select(&SEL_POST_IMAGE_BLOCK) .next() - .ok_or("missing post_comment")?; + .context("missing post_comment")?; let (files, text) = Post::parse_post_comment(post_comment)?; Ok(Post { @@ -104,8 +106,7 @@ impl Post { String, // time String, // num u32 // id - ), - Box<dyn std::error::Error> + ) > { static SEL_SPAN_POST_ID_A_HREF: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new( || scraper::Selector::parse("span.post_id a[href]").unwrap() @@ -131,7 +132,7 @@ impl Post { .next() .and_then(|el| el.value().attr("href")) .and_then(|href| href.strip_prefix('#')) - .ok_or("missing post id")? + .context("missing post id")? .parse()?; let subject = post_head @@ -154,14 +155,14 @@ impl Post { let time = post_head .select(&SEL_SPAN_POST_TIME) .next() - .ok_or("missing post_time")? + .context("missing post_time")? .text() .collect::<String>(); let num = post_head .select(&SEL_SPAN_POST_NUM) .next() - .ok_or("missing post_num")? + .context("missing post_num")? .text() .collect::<String>(); @@ -179,7 +180,7 @@ impl Post { /// </span> fn parse_post_comment( node: scraper::ElementRef, - ) -> Result<(Vec<File>, String), Box<dyn std::error::Error>> { + ) -> Result<(Vec<File>, String)> { static SEL_POST_IMAGE_BLOCK: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new( || scraper::Selector::parse("div.post_image_block").unwrap() ); @@ -195,7 +196,7 @@ impl Post { let text = Post::parse_post_comment_body(node .select(&SEL_POST_COMMENT_BODY) .next() - .ok_or("missing post_comment_body")?); + .context("missing post_comment_body")?); Ok((files, text)) }