arhivach-downloader

Download arhivach.vc threads
git clone https://git.ea.contact/arhivach-downloader
Log | Files | Refs | README

commit c45b7960d9e8527bab3ac0b984e814e1c8f32dca
parent 207a68d04aca44e318f2b4474bc76c937066836c
Author: egor-achkasov <eaachkasov@gmail.com>
Date:   Thu, 26 Feb 2026 15:18:18 +0000

Refactor to lib+bin arch

Diffstat:
M Cargo.toml | 4 ++++
M src/backend.rs | 56 +++++++++++++++++---------------------------------------
A src/bin/cli/main.rs | 131 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
D src/cli.rs | 54 ------------------------------------------------------
M src/events.rs | 22 ++++++++++++++++++++++
M src/export.rs | 178 ++++++++++---------------------------------------------------------------------
D src/file.rs | 22 ----------------------
A src/http.rs | 35 +++++++++++++++++++++++++++++++++++
M src/lib.rs | 7 +++++--
D src/main.rs | 80 -------------------------------------------------------------------------------
M src/post.rs | 68 ++++++++++++++++++++++++++++++++++++++++++++------------------------
A src/render.rs | 125 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
12 files changed, 404 insertions(+), 378 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml @@ -4,6 +4,10 @@ description = "Download threads from arhivach." version = "0.1.0" edition = "2024" +[[bin]] +name = "arhivach-downloader-cli" +path = "src/bin/cli/main.rs" + [dependencies] anyhow = "1.0.102" clap = { version = "4.5.57", features = ["derive"] } diff --git a/src/backend.rs b/src/backend.rs @@ -1,76 +1,54 @@ -use std::sync::mpsc::Sender; - use anyhow::{Context, Ok, Result}; use std::result::Result::Ok as StdOk; -use crate::{config::Config, events::Event, export, post::Post}; - -pub fn fetch_with_retry(url: &str, attempts: u32, tx: &Sender<Event>) -> Result<String> { - for attempt in 1..=attempts { - match reqwest::blocking::get(url).and_then(|r| r.text()) { - StdOk(text) => return Ok(text), - Err(e) => { - tx.send(Event::FetchRetrying { - url: url.to_string(), - attempt, - max_attempts: attempts, - error: e.to_string(), - }).ok(); - if attempt < attempts { - std::thread::sleep(std::time::Duration::from_secs(3)); - } - } - } - } - anyhow::bail!("failed to get thread after {attempts} attempts") -} +use crate::{config::Config, events::{Event, Reporter}, export, http, post::Post}; -pub fn scrape_thread(url: &str, config: &Config, tx: &Sender<Event>) -> Result<Post> { +pub fn scrape_thread(url: &str, config: &Config, reporter: &dyn Reporter) -> Result<Post> { let t_total = std::time::Instant::now(); - tx.send(Event::FetchStarted { url: url.to_string() }).ok(); + reporter.report(Event::FetchStarted { url: url.to_string() }); let t = std::time::Instant::now(); - let html = fetch_with_retry(url, 3, tx)?; - tx.send(Event::FetchDone { elapsed_ms: t.elapsed().as_millis() }).ok(); + let html = http::fetch_with_retry(url, 3, reporter)?; + reporter.report(Event::FetchDone { elapsed_ms: t.elapsed().as_millis() }); - tx.send(Event::ParseStarted).ok(); + reporter.report(Event::ParseStarted); let t = std::time::Instant::now(); let posts = Post::parse_posts(&html).context("failed to parse thread HTML")?; - tx.send(Event::ParseDone { 
+ reporter.report(Event::ParseDone { post_count: posts.len(), elapsed_ms: t.elapsed().as_millis(), - }).ok(); + }); let first_post = posts.first().context("thread has no posts")?.clone(); - export::export2html(&posts, config, tx).context("failed to export thread")?; + export::export2html(&posts, config, reporter).context("failed to export thread")?; - tx.send(Event::ThreadDone { + reporter.report(Event::ThreadDone { url: url.to_string(), elapsed_ms: t_total.elapsed().as_millis(), - }).ok(); + }); Ok(first_post) } -pub fn run(config: &Config, tx: Sender<Event>) -> Result<()> { +pub fn run(config: &Config, reporter: &dyn Reporter) -> Result<()> { let total = config.urls.len(); let mut first_posts: Vec<Post> = Vec::new(); for (i, url) in config.urls.iter().enumerate() { - tx.send(Event::ThreadStarted { + reporter.report(Event::ThreadStarted { url: url.clone(), index: i + 1, total, - }).ok(); + }); - match scrape_thread(url, config, &tx) { + match scrape_thread(url, config, reporter) { StdOk(first_post) => first_posts.push(first_post), Err(e) => { - tx.send(Event::ThreadFailed { + reporter.report(Event::ThreadFailed { url: url.clone(), error: format!("{:#}", e), - }).ok(); + }); } } } diff --git a/src/bin/cli/main.rs b/src/bin/cli/main.rs @@ -0,0 +1,131 @@ +use arhivarch_downloader::{backend, events::Event, config::Config}; + +use clap::Parser; +use anyhow::Result; + +use std::path::PathBuf; +use std::sync::mpsc; + +fn main() -> anyhow::Result<()> { + let config = parse_args().unwrap_or_else(|e| { + eprintln!("Error: {}", e); + std::process::exit(1); + }); + + let (tx, rx) = mpsc::channel::<Event>(); + + let handle = std::thread::spawn({ + let config = config.clone(); + move || backend::run(&config, &tx) + }); + + for event in rx { + render_event(&event); + } + + handle.join().unwrap() +} + +pub fn parse_args() -> Result<Config> { + #[derive(Parser)] + #[command(about, long_about)] + struct Cli { + /// URL to download + url: Option<String>, + + /// Path to a text file 
containing a list of URLs (one per line) + #[arg(short = 'l', long = "list")] + list: Option<PathBuf>, + + /// Download thumbnail images, default: false + #[arg(short = 't', long = "thumb", default_value_t = false)] + thumb: bool, + + /// Download files (images, videos, gifs, etc), default: false + #[arg(short = 'f', long = "files", default_value_t = false)] + files: bool, + + /// Resume files and thumbnails downloading instead of overwriting. Useless if neither -t nor -f are set, default: false + #[arg(short = 'r', long = "resume", default_value_t = false)] + resume: bool + } + let cli = Cli::parse(); + + let mut urls = Vec::new(); + // [URL] + if let Some(url) = cli.url { + urls.push(url); + } + // [List] + if let Some(list) = cli.list { + for line in std::fs::read_to_string(list)?.lines() { + urls.push(line.to_string()); + } + } + if urls.is_empty() { + anyhow::bail!("No URLs provided"); + } + + Ok(Config { + urls, + thumb: cli.thumb, + files: cli.files, + resume: cli.resume, + }) +} + +fn render_event(event: &Event) { + use std::io::Write; + match event { + Event::ThreadStarted { url, index, total } => + println!("Processing {} ({} / {}):", url, index, total), + + Event::ThreadDone { url, elapsed_ms } => + println!("Done processing {} ({} ms)", url, elapsed_ms), + + Event::ThreadFailed { url, error } => + eprintln!("Error processing {}: {}", url, error), + + Event::FetchStarted { .. } => { + print!("\tGetting thread..."); + std::io::stdout().flush().ok(); + } + + Event::FetchDone { elapsed_ms } => + println!(" Done ({} ms)", elapsed_ms), + + Event::FetchRetrying { url, attempt, max_attempts, error } => { + eprintln!("\n\tHTTP request failed for {}: {}", url, error); + if attempt < max_attempts { + eprintln!("\tWaiting 3 seconds..."); + } + } + + Event::ParseStarted => { + print!("\tParsing posts..."); + std::io::stdout().flush().ok(); + } + + Event::ParseDone { elapsed_ms, .. 
} => + println!(" Done ({} ms)", elapsed_ms), + + Event::DownloadBatchStarted { label, total_posts } => { + print!("\tDownloading {}... post 0 / {}", label, total_posts); + std::io::stdout().flush().ok(); + } + + Event::DownloadBatchProgress { label, done, total } => { + print!("\r\tDownloading {}... post {} / {}", label, done, total); + std::io::stdout().flush().ok(); + } + + Event::DownloadAssetFailed { label, filename, error, .. } => + println!("\r\tFailed to download {} {}: {}\n\t-> Waiting 3 seconds...", label, filename, error), + + Event::DownloadAssetSkipped { label, filename } => + println!("\tSkipping {} {} after 3 failed attempts.", label, filename), + + Event::DownloadBatchDone { elapsed_ms, .. } => + println!(" Done ({} ms)", elapsed_ms), + } +} diff --git a/src/cli.rs b/src/cli.rs @@ -1,54 +0,0 @@ -use clap::Parser; -use anyhow::Result; - -use std::path::PathBuf; - -use arhivarch_downloader::config::Config; - -pub fn parse_args() -> Result<Config> { - #[derive(Parser)] - #[command(about, long_about)] - struct Cli { - /// URL to download - url: Option<String>, - - /// Path to a text file containing a list of URLs (one per line) - #[arg(short = 'l', long = "list")] - list: Option<PathBuf>, - - /// Download thumbnail images, default: false - #[arg(short = 't', long = "thumb", default_value_t = false)] - thumb: bool, - - /// Download files (images, videos, gifs, etc), default: false - #[arg(short = 'f', long = "files", default_value_t = false)] - files: bool, - - /// Resume files and thumbnails downloading instead of overwriting. 
Useless if neither -t nor -f are set, default: false - #[arg(short = 'r', long = "resume", default_value_t = false)] - resume: bool - } - let cli = Cli::parse(); - - let mut urls = Vec::new(); - // [URL] - if let Some(url) = cli.url { - urls.push(url); - } - // [List] - if let Some(list) = cli.list { - for line in std::fs::read_to_string(list)?.lines() { - urls.push(line.to_string()); - } - } - if urls.is_empty() { - anyhow::bail!("No URLs provided"); - } - - Ok(Config { - urls, - thumb: cli.thumb, - files: cli.files, - resume: cli.resume, - }) -} diff --git a/src/events.rs b/src/events.rs @@ -21,3 +21,25 @@ pub enum Event { DownloadAssetSkipped { label: String, filename: String }, DownloadBatchDone { label: String, elapsed_ms: u128 }, } + +use std::sync::mpsc; + +/// Sink for progress events emitted by the library. +/// Implement this to connect the library to any frontend. +pub trait Reporter: Send + Sync { + fn report(&self, event: Event); +} + +/// Blanket impl: mpsc::Sender<Event> is already a valid Reporter. +impl Reporter for mpsc::Sender<Event> { + fn report(&self, event: Event) { + self.send(event).ok(); + } +} + +/// No-op reporter — useful in tests or when progress output is not needed. +pub struct NullReporter; + +impl Reporter for NullReporter { + fn report(&self, _event: Event) {} +} diff --git a/src/export.rs b/src/export.rs @@ -1,47 +1,11 @@ -use std::sync::mpsc::Sender; - -use crate::{config::Config, events::Event, post::Post}; +use crate::{config::Config, events::{Event, Reporter}, http, post::{File, Post}, render}; use anyhow::{Result, Context}; -const TEMPLATE: &'static str = include_str!("../template.html"); - -fn html_escape(s: &str) -> String { - s.replace('&', "&amp;") - .replace('<', "&lt;") - .replace('>', "&gt;") - .replace('"', "&quot;") -} - -/// Converts plain post text to HTML. 
-/// - `>>id` → reply link anchor -/// - Lines starting with `>` (not `>>digit`) → greentext span -/// - `\n` → `<br>` -fn render_text_to_html(text: &str) -> String { - static RE_REPLY: std::sync::LazyLock<regex::Regex> = std::sync::LazyLock::new(|| { - regex::Regex::new(r"&gt;&gt;(\d+)").unwrap() - }); - - let lines: Vec<String> = text.split('\n').map(|line| { - let escaped = html_escape(line); - // Greentext: starts with > but not >>digit - let processed = if escaped.starts_with("&gt;") && !escaped.starts_with("&gt;&gt;") { - format!("<span class=\"quote\">{}</span>", escaped) - } else { - escaped - }; - // Reply links: >>id - RE_REPLY.replace_all(&processed, |caps: &regex::Captures| { - let id = &caps[1]; - format!("<a href=\"#post{}\" class=\"reply-link\">&gt;&gt;{}</a>", id, id) - }).into_owned() - }).collect(); - - lines.join("<br>\n") -} +const TEMPLATE: &str = include_str!("../template.html"); /// Write a top-level index.html with one entry per thread (first post + link to thread folder) -pub(crate) fn write_index_html(first_posts: &[Post], config: &Config) -> Result<()> { +pub fn write_index_html(first_posts: &[Post], config: &Config) -> Result<()> { if first_posts.is_empty() { return Ok(()); } @@ -49,7 +13,7 @@ pub(crate) fn write_index_html(first_posts: &[Post], config: &Config) -> Result< let posts_html: String = first_posts .iter() .map(|p| { - let mut post_html = render_post(p, config.files, config.thumb); + let mut post_html = render::render_post(p, config.files, config.thumb); // render_post references thumbnails and images in the same directory, // so replace them with links to the thread folder config.files.then(|| post_html = post_html.replace( @@ -80,7 +44,7 @@ pub(crate) fn write_index_html(first_posts: &[Post], config: &Config) -> Result< /// If download_thumbnails is true, downloads thumbnails to ./{thread_id}/thumb /// /// WARNING: If the directory already exists, it will be overwritten -pub(crate) fn export2html(posts: &[Post], config: 
&Config, tx: &Sender<Event>) -> Result<()> { +pub fn export2html(posts: &[Post], config: &Config, reporter: &dyn Reporter) -> Result<()> { if posts.is_empty() { anyhow::bail!("No posts to export"); } @@ -90,7 +54,7 @@ pub(crate) fn export2html(posts: &[Post], config: &Config, tx: &Sender<Event>) - let posts_html: String = posts .iter() - .map(|p| render_post(p, config.files, config.thumb)) + .map(|p| render::render_post(p, config.files, config.thumb)) .collect::<Vec<String>>() .join("\n"); @@ -101,7 +65,7 @@ pub(crate) fn export2html(posts: &[Post], config: &Config, tx: &Sender<Event>) - "files", |f| &f.url, config.resume, - tx, + reporter, )?; } if config.thumb { @@ -111,7 +75,7 @@ pub(crate) fn export2html(posts: &[Post], config: &Config, tx: &Sender<Event>) - "thumbnails", |f| &f.url_thumb, config.resume, - tx, + reporter, )?; } @@ -121,112 +85,22 @@ pub(crate) fn export2html(posts: &[Post], config: &Config, tx: &Sender<Event>) - Ok(()) } -fn render_post(post: &Post, download_files: bool, download_thumbnails: bool) -> String { - let mut html = format!("<div class=\"post\" id=\"post{}\">\n", post.id); - - html.push_str(" <div class=\"post-head\">\n"); - - // Subject - if let Some(ref subject) = post.subject { - html.push_str(&format!( - " <span class=\"post-subject\">{}</span>\n", - html_escape(subject) - )); - } - - // Name /w mailto/sage - let name = post.name.as_deref().unwrap_or("Аноним"); - let name_display = if let Some(ref mailto) = post.mailto { - format!("[{}] {}", mailto, name) - } else { - name.to_string() - }; - html.push_str(&format!( - " <span class=\"post-name\">{}</span>\n", - html_escape(&name_display) - )); - - // Time, num, id - html.push_str(&format!(" <span class=\"post-time\">{}</span>\n", html_escape(&post.time))); - html.push_str(&format!(" <span class=\"post-num\">{}</span>\n", html_escape(&post.num))); - html.push_str(&format!( - " <span class=\"post-id\"><a href=\"#post{0}\">№{0}</a></span>\n", - post.id - )); - - html.push_str(" 
</div>\n"); - - // Images - html.push_str(&render_images(&post.files, download_files, download_thumbnails)); - - // Body - html.push_str(" <div class=\"post-body\">\n"); - if !post.text.is_empty() { - html.push_str(" "); - html.push_str(&render_text_to_html(&post.text)); - html.push('\n'); - } - html.push_str(" </div>\n"); - - html.push_str("</div>\n"); - html -} - -fn render_images( - files: &[crate::file::File], - download_files: bool, - download_thumbnails: bool, -) -> String { - if files.is_empty() { - return String::new(); - } - - let mut html = String::from(" <div class=\"post-images\">\n"); - for file in files { - let href = if download_files && !file.url.is_empty() { - format!("files/{}", file.url.split('/').last().unwrap_or("")) - } else { - file.url.clone() - }; - - let thumb_filename = file.url_thumb.split('/').last().unwrap_or("").to_string(); - let img_src = if download_thumbnails && !file.url_thumb.is_empty() { - format!("thumb/{}", thumb_filename) - } else { - file.url_thumb.clone() - }; - - html.push_str(&format!( - " <div class=\"post-image\">\n <a href=\"{}\" target=\"_blank\" title=\"{}\">\n <img src=\"{}\" alt=\"\" loading=\"lazy\">\n </a>\n <div class=\"post-image-info\">{} (<a href=\"{}\" target=\"_blank\" class=\"post-image-link\">o</a>, <a href=\"{}\" target=\"_blank\" class=\"post-image-link\">t</a>)</div>\n </div>\n", - html_escape(&href), - html_escape(&file.name_orig), - html_escape(&img_src), - html_escape(&file.name_orig), - html_escape(&file.url), - html_escape(&file.url_thumb), - )); - } - html.push_str(" </div>\n"); - html -} - - fn download_assets( posts: &[Post], dest_dir: &str, label: &str, - url_of: impl Fn(&crate::file::File) -> &str, + url_of: impl Fn(&File) -> &str, skip_if_exists: bool, - tx: &Sender<Event>, + reporter: &dyn Reporter, ) -> Result<()> { std::fs::create_dir_all(dest_dir) .with_context(|| format!("Failed to create directory {}", dest_dir))?; let t = std::time::Instant::now(); - 
tx.send(Event::DownloadBatchStarted { + reporter.report(Event::DownloadBatchStarted { label: label.to_string(), total_posts: posts.len(), - }).ok(); + }); for (i, post) in posts.iter().enumerate() { for f in &post.files { @@ -238,45 +112,35 @@ fn download_assets( } let mut result = Err(anyhow::anyhow!("no attempts")); for attempt in 1..=3u32 { - result = download(url, &path); + result = http::download(url, &path); if result.is_ok() { break; } let e = result.as_ref().unwrap_err(); - tx.send(Event::DownloadAssetFailed { + reporter.report(Event::DownloadAssetFailed { label: label.to_string(), filename: filename.clone(), attempt, error: e.to_string(), - }).ok(); + }); std::thread::sleep(std::time::Duration::from_secs(3)); } if result.is_err() { - tx.send(Event::DownloadAssetSkipped { + reporter.report(Event::DownloadAssetSkipped { label: label.to_string(), filename: filename.clone(), - }).ok(); + }); } } - tx.send(Event::DownloadBatchProgress { + reporter.report(Event::DownloadBatchProgress { label: label.to_string(), done: i + 1, total: posts.len(), - }).ok(); + }); } - tx.send(Event::DownloadBatchDone { + reporter.report(Event::DownloadBatchDone { label: label.to_string(), elapsed_ms: t.elapsed().as_millis(), - }).ok(); - - Ok(()) -} + }); -fn download(url: &str, path: &str) -> Result<()> { - let bytes = reqwest::blocking::get(url) - .with_context(|| format!("HTTP GET failed for {}", url))? 
- .bytes() - .context("failed to read response body")?; - std::fs::write(path, &bytes) - .with_context(|| format!("failed to write {}", path))?; Ok(()) } diff --git a/src/file.rs b/src/file.rs @@ -1,21 +0,0 @@ -#[derive(Debug, Clone)] -pub struct File { - /// original name, "videolol.mp4" - pub name_orig: String, - /// timestampname, "17699100670710.mp4" - pub name_timestamp: String, - /// thumbnail url, "https://arhivach.vc/storage/t/aeaa7825f8d8ffe3f07f242a59b7761c.thumb" - pub url_thumb: String, - /// url, "https://i.arhivach.vc/storage/a/ea/aeaa7825f8d8ffe3f07f242a59b7761c.mp4" - pub url: String, -} - -impl std::fmt::Display for File { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "{} [{}]\n url: {}\n thumb: {}", - self.name_orig, self.name_timestamp, self.url, self.url_thumb - ) - } -} -\ No newline at end of file diff --git a/src/http.rs b/src/http.rs @@ -0,0 +1,35 @@ +use anyhow::{Context, Result}; + +use crate::events::{Event, Reporter}; + +/// GET a URL with up to `attempts` retries, reporting each failure via `reporter`. +pub fn fetch_with_retry(url: &str, attempts: u32, reporter: &dyn Reporter) -> Result<String> { + for attempt in 1..=attempts { + match reqwest::blocking::get(url).and_then(|r| r.text()) { + Ok(text) => return Ok(text), + Err(e) => { + reporter.report(Event::FetchRetrying { + url: url.to_string(), + attempt, + max_attempts: attempts, + error: e.to_string(), + }); + if attempt < attempts { + std::thread::sleep(std::time::Duration::from_secs(3)); + } + } + } + } + anyhow::bail!("failed to get thread after {attempts} attempts") +} + +/// Download a single URL and write it to `path`. +pub fn download(url: &str, path: &str) -> Result<()> { + let bytes = reqwest::blocking::get(url) + .with_context(|| format!("HTTP GET failed for {}", url))? 
+ .bytes() + .context("failed to read response body")?; + std::fs::write(path, &bytes) + .with_context(|| format!("failed to write {}", path))?; + Ok(()) +} diff --git a/src/lib.rs b/src/lib.rs @@ -2,5 +2,8 @@ pub mod config; pub mod events; pub mod backend; pub mod post; -pub mod file; -pub(crate) mod export; +pub mod http; +pub mod render; +pub mod export; + +pub use events::{Reporter, NullReporter}; diff --git a/src/main.rs b/src/main.rs @@ -1,80 +0,0 @@ -mod cli; - -use arhivarch_downloader::{backend, events::Event}; -use std::sync::mpsc; - -fn main() -> anyhow::Result<()> { - let config = cli::parse_args().unwrap_or_else(|e| { - eprintln!("Error: {}", e); - std::process::exit(1); - }); - - let (tx, rx) = mpsc::channel::<Event>(); - - let handle = std::thread::spawn({ - let config = config.clone(); - move || backend::run(&config, tx) - }); - - for event in rx { - render_event(&event); - } - - handle.join().unwrap() -} - -fn render_event(event: &Event) { - use std::io::Write; - match event { - Event::ThreadStarted { url, index, total } => - println!("Processing {} ({} / {}):", url, index, total), - - Event::ThreadDone { url, elapsed_ms } => - println!("Done processing {} ({} ms)", url, elapsed_ms), - - Event::ThreadFailed { url, error } => - eprintln!("Error processing {}: {}", url, error), - - Event::FetchStarted { .. } => { - print!("\tGetting thread..."); - std::io::stdout().flush().ok(); - } - - Event::FetchDone { elapsed_ms } => - println!(" Done ({} ms)", elapsed_ms), - - Event::FetchRetrying { url, attempt, max_attempts, error } => { - eprintln!("\n\tHTTP request failed for {}: {}", url, error); - if attempt < max_attempts { - eprintln!("\tWaiting 3 seconds..."); - } - } - - Event::ParseStarted => { - print!("\tParsing posts..."); - std::io::stdout().flush().ok(); - } - - Event::ParseDone { elapsed_ms, .. } => - println!(" Done ({} ms)", elapsed_ms), - - Event::DownloadBatchStarted { label, total_posts } => { - print!("\tDownloading {}... 
post 0 / {}", label, total_posts); - std::io::stdout().flush().ok(); - } - - Event::DownloadBatchProgress { label, done, total } => { - print!("\r\tDownloading {}... post {} / {}", label, done, total); - std::io::stdout().flush().ok(); - } - - Event::DownloadAssetFailed { label, filename, error, .. } => - println!("\r\tFailed to download {} {}: {}\n\t-> Waiting 3 seconds...", label, filename, error), - - Event::DownloadAssetSkipped { label, filename } => - println!("\tSkipping {} {} after 3 failed attempts.", label, filename), - - Event::DownloadBatchDone { elapsed_ms, .. } => - println!(" Done ({} ms)", elapsed_ms), - } -} diff --git a/src/post.rs b/src/post.rs @@ -1,7 +1,38 @@ -use crate::file::File; - use anyhow::{Context, Result}; +const BASE_URL: &str = "https://arhivach.vc"; + +#[derive(Debug, Clone)] +pub struct File { + /// original name, "videolol.mp4" + pub name_orig: String, + /// timestampname, "17699100670710.mp4" + pub name_timestamp: String, + /// thumbnail url, "https://arhivach.vc/storage/t/aeaa7825f8d8ffe3f07f242a59b7761c.thumb" + pub url_thumb: String, + /// url, "https://i.arhivach.vc/storage/a/ea/aeaa7825f8d8ffe3f07f242a59b7761c.mp4" + pub url: String, +} + +impl std::fmt::Display for File { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{} [{}]\n url: {}\n thumb: {}", + self.name_orig, self.name_timestamp, self.url, self.url_thumb + ) + } +} + +struct PostHead { + subject: Option<String>, + name: Option<String>, + mailto: Option<String>, + time: String, + num: String, + id: u32, +} + /// Represents a single post in a thread #[derive(Debug, Clone)] pub struct Post { @@ -59,7 +90,7 @@ impl Post { .select(&SEL_POST_HEAD) .next() .context("missing post_head")?; - let (subject, name, mailto, time, num, id) = Post::parse_post_head(post_head)?; + let head = Post::parse_post_head(post_head)?; let post_comment = node .select(&SEL_POST_IMAGE_BLOCK) @@ -68,12 +99,12 @@ impl Post { let (files, text) = 
Post::parse_post_comment(post_comment)?; Ok(Post { - subject, - name, - mailto, - time, - num, - id, + subject: head.subject, + name: head.name, + mailto: head.mailto, + time: head.time, + num: head.num, + id: head.id, files, text, }) @@ -96,18 +127,7 @@ impl Post { /// </span> &nbsp; /// </div> /// ``` - fn parse_post_head( - post_head: scraper::ElementRef - ) -> Result< - ( - Option<String>, // subject - Option<String>, // name - Option<String>, // mailto - String, // time - String, // num - u32 // id - ) - > { + fn parse_post_head(post_head: scraper::ElementRef) -> Result<PostHead> { static SEL_SPAN_POST_ID_A_HREF: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new( || scraper::Selector::parse("span.post_id a[href]").unwrap() ); @@ -166,7 +186,7 @@ impl Post { .text() .collect::<String>(); - Ok((subject, name, mailto, time, num, id)) + Ok(PostHead { subject, name, mailto, time, num, id }) } /// Parses the sapn post_comment element from a post element @@ -248,7 +268,7 @@ impl Post { let url_thumb = if url_thumb.is_empty() { String::new() } else { - format!("https://arhivach.vc{}", url_thumb) + format!("{BASE_URL}{url_thumb}") }; // url @@ -262,7 +282,7 @@ impl Post { } else if url.is_empty() { String::new() } else { - format!("https://arhivach.vc{}", url) + format!("{BASE_URL}{url}") }; File { diff --git a/src/render.rs b/src/render.rs @@ -0,0 +1,125 @@ +use crate::post::{File, Post}; + +fn html_escape(s: &str) -> String { + s.replace('&', "&amp;") + .replace('<', "&lt;") + .replace('>', "&gt;") + .replace('"', "&quot;") +} + +/// Converts plain post text to HTML. 
+/// - `>>id` → reply link anchor +/// - Lines starting with `>` (not `>>digit`) → greentext span +/// - `\n` → `<br>` +pub fn render_text_to_html(text: &str) -> String { + static RE_REPLY: std::sync::LazyLock<regex::Regex> = std::sync::LazyLock::new(|| { + regex::Regex::new(r"&gt;&gt;(\d+)").unwrap() + }); + + let lines: Vec<String> = text.split('\n').map(|line| { + let escaped = html_escape(line); + // Greentext: starts with > but not >>digit + let processed = if escaped.starts_with("&gt;") && !escaped.starts_with("&gt;&gt;") { + format!("<span class=\"quote\">{}</span>", escaped) + } else { + escaped + }; + // Reply links: >>id + RE_REPLY.replace_all(&processed, |caps: &regex::Captures| { + let id = &caps[1]; + format!("<a href=\"#post{}\" class=\"reply-link\">&gt;&gt;{}</a>", id, id) + }).into_owned() + }).collect(); + + lines.join("<br>\n") +} + +/// Renders a single post to an HTML fragment string. +pub fn render_post(post: &Post, download_files: bool, download_thumbnails: bool) -> String { + let mut html = format!("<div class=\"post\" id=\"post{}\">\n", post.id); + + html.push_str(" <div class=\"post-head\">\n"); + + // Subject + if let Some(ref subject) = post.subject { + html.push_str(&format!( + " <span class=\"post-subject\">{}</span>\n", + html_escape(subject) + )); + } + + // Name /w mailto/sage + let name = post.name.as_deref().unwrap_or("Аноним"); + let name_display = if let Some(ref mailto) = post.mailto { + format!("[{}] {}", mailto, name) + } else { + name.to_string() + }; + html.push_str(&format!( + " <span class=\"post-name\">{}</span>\n", + html_escape(&name_display) + )); + + // Time, num, id + html.push_str(&format!(" <span class=\"post-time\">{}</span>\n", html_escape(&post.time))); + html.push_str(&format!(" <span class=\"post-num\">{}</span>\n", html_escape(&post.num))); + html.push_str(&format!( + " <span class=\"post-id\"><a href=\"#post{0}\">№{0}</a></span>\n", + post.id + )); + + html.push_str(" </div>\n"); + + // Images + 
html.push_str(&render_images(&post.files, download_files, download_thumbnails)); + + // Body + html.push_str(" <div class=\"post-body\">\n"); + if !post.text.is_empty() { + html.push_str(" "); + html.push_str(&render_text_to_html(&post.text)); + html.push('\n'); + } + html.push_str(" </div>\n"); + + html.push_str("</div>\n"); + html +} + +fn render_images( + files: &[File], + download_files: bool, + download_thumbnails: bool, +) -> String { + if files.is_empty() { + return String::new(); + } + + let mut html = String::from(" <div class=\"post-images\">\n"); + for file in files { + let href = if download_files && !file.url.is_empty() { + format!("files/{}", file.url.split('/').last().unwrap_or("")) + } else { + file.url.clone() + }; + + let thumb_filename = file.url_thumb.split('/').last().unwrap_or("").to_string(); + let img_src = if download_thumbnails && !file.url_thumb.is_empty() { + format!("thumb/{}", thumb_filename) + } else { + file.url_thumb.clone() + }; + + html.push_str(&format!( + " <div class=\"post-image\">\n <a href=\"{}\" target=\"_blank\" title=\"{}\">\n <img src=\"{}\" alt=\"\" loading=\"lazy\">\n </a>\n <div class=\"post-image-info\">{} (<a href=\"{}\" target=\"_blank\" class=\"post-image-link\">o</a>, <a href=\"{}\" target=\"_blank\" class=\"post-image-link\">t</a>)</div>\n </div>\n", + html_escape(&href), + html_escape(&file.name_orig), + html_escape(&img_src), + html_escape(&file.name_orig), + html_escape(&file.url), + html_escape(&file.url_thumb), + )); + } + html.push_str(" </div>\n"); + html +}