arhivach-downloader

Download arhivach.vc threads
git clone https://git.ea.contact/arhivach-downloader
Log | Files | Refs | README

commit 766cc139ec89a1ac4d6c4da3bbbb5398fe493856
parent 6815f58962d8337ca8590c4af9218ded1ff11752
Author: egor-achkasov <eaachkasov@gmail.com>
Date:   Mon,  9 Mar 2026 01:02:45 +0000

Complete refactor to separate front and back; encapsulate export

Diffstat:
MCargo.toml | 3+++
MREADME.md | 32++++++++++++++++----------------
Dsrc/backend.rs | 59-----------------------------------------------------------
Msrc/bin/cli/main.rs | 153+++++++++++++++++++++++++++++++++++++++----------------------------------------
Dsrc/config.rs | 7-------
Dsrc/events.rs | 45---------------------------------------------
Dsrc/export/html/mod.rs | 142-------------------------------------------------------------------------------
Dsrc/export/html/render.rs | 139-------------------------------------------------------------------------------
Dsrc/export/mod.rs | 8--------
Dsrc/http.rs | 35-----------------------------------
Dsrc/lib.rs | 9---------
Asrc/lib/config.rs | 12++++++++++++
Asrc/lib/download.rs | 25+++++++++++++++++++++++++
Asrc/lib/event.rs | 29+++++++++++++++++++++++++++++
Asrc/lib/export/html/mod.rs | 27+++++++++++++++++++++++++++
Asrc/lib/export/html/render.rs | 140+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Rsrc/export/html/template.html -> src/lib/export/html/template.html | 0
Asrc/lib/export/mod.rs | 35+++++++++++++++++++++++++++++++++++
Asrc/lib/lib.rs | 96+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/lib/post.rs | 374+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Dsrc/post.rs | 374-------------------------------------------------------------------------------
21 files changed, 833 insertions(+), 911 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml @@ -4,6 +4,9 @@ description = "Download threads from arhivach." version = "0.1.0" edition = "2024" +[lib] +path = "src/lib/lib.rs" + [[bin]] name = "arhivach-downloader-cli" path = "src/bin/cli/main.rs" diff --git a/README.md b/README.md @@ -9,28 +9,28 @@ Download threads from arhivach.vc and save them locally for offline access or pr `arhivach-downloader --help`: ``` -Download threads from arhivach. - -Usage: arhivarch-downloader.exe [OPTIONS] [URL] +Usage: arhivarch-downloader-cli.exe [OPTIONS] <URL> Arguments: - [URL] URL to download + <URL> URL to download Options: - -l, --list <LIST> Path to a text file containing a list of URLs (one per line) - -t, --thumb Download thumbnail images, default: false - -f, --files Download files (images, videos, gifs, etc), default: false - -r, --resume Resume files and thumbnails downloading instead of overwriting. Useless if neither -t nor -f are set, default: false - -h, --help Print help + -d, --dir <DIR> Path to download directory [default: .] + -e, --exporter <EXPORTER> Exporter [default: html] [possible values: html] + -t, --thumb Download thumbnail images, default: false + -f, --files Download files (images, videos, gifs, etc), default: false + -r, --resume Resume files and thumbnails downloading instead of overwriting. Useless if neither -t nor -f are set, default: false + -R, --retries <RETRIES> Download retries in case of a error [default: 3] + -h, --help Print help ``` -Each thread will be downloaded in a directory named by an OP №. Contents: -- index.html -- the thread. Open it with your web browser. -- files directory (if -f (--files) argument is given) -- all the files original attached to posts. Might be heavy if there are many videos. -- thumb directory (if -t (--thumb) argument is given) -- all the thumbnails needed to render file previews in the thread. 
+Creates a subdirectory named after the arhivach thread id (the number after `/thread/` in the URL) inside the download directory, and saves the thread there. Contents: +- `index.html` — the thread. Open it with your web browser. +- `files/` (if `-f`/`--files` is given) — original files attached to posts. May be large if there are many videos. +- `thumb/` (if `-t`/`--thumb` is given) — thumbnails needed to render file previews in the thread. -Main index.html will be created in the current directory to feature the first posts of the downloaded threads. +Use `-r`/`--resume` to skip files and thumbnails that are already downloaded. -Note that you may pass an URL directly as an argument, pass a path to a text file with URLs via -f, or both. +Use `-d`/`--dir` to specify where to create the thread directory (defaults to the current directory). -Use -r (--resume) to skip downloading files and thumbnails that are already there. +Use `-R`/`--retries` to control how many times a failed download is retried (default: 3). 
diff --git a/src/backend.rs b/src/backend.rs @@ -1,59 +0,0 @@ -use anyhow::{Context, Ok, Result}; -use std::result::Result::Ok as StdOk; - -use crate::{config::Config, events::{Event, Reporter}, export::{Export, html}, http, post::Post}; - -pub fn scrape_thread(url: &str, config: &Config, reporter: &dyn Reporter, exporter: &dyn Export) -> Result<Post> { - let t_total = std::time::Instant::now(); - - reporter.report(Event::FetchStarted { url: url.to_string() }); - let t = std::time::Instant::now(); - let html_content = http::fetch_with_retry(url, 3, reporter)?; - reporter.report(Event::FetchDone { elapsed_ms: t.elapsed().as_millis() }); - - reporter.report(Event::ParseStarted); - let t = std::time::Instant::now(); - let posts = Post::parse_posts(&html_content).context("failed to parse thread HTML")?; - reporter.report(Event::ParseDone { - post_count: posts.len(), - elapsed_ms: t.elapsed().as_millis(), - }); - - let first_post = posts.first().context("thread has no posts")?.clone(); - - exporter.export(&posts, config, reporter).context("failed to export thread")?; - - reporter.report(Event::ThreadDone { - url: url.to_string(), - elapsed_ms: t_total.elapsed().as_millis(), - }); - - Ok(first_post) -} - -pub fn run(config: &Config, reporter: &dyn Reporter, exporter: &dyn Export) -> Result<()> { - let total = config.urls.len(); - let mut first_posts: Vec<Post> = Vec::new(); - - for (i, url) in config.urls.iter().enumerate() { - reporter.report(Event::ThreadStarted { - url: url.clone(), - index: i + 1, - total, - }); - - match scrape_thread(url, config, reporter, exporter) { - StdOk(first_post) => first_posts.push(first_post), - Err(e) => { - reporter.report(Event::ThreadFailed { - url: url.clone(), - error: format!("{:#}", e), - }); - } - } - } - - html::write_index_html(&first_posts, config).context("failed to write main index.html")?; - - Ok(()) -} diff --git a/src/bin/cli/main.rs b/src/bin/cli/main.rs @@ -1,41 +1,47 @@ -use arhivarch_downloader::{backend, 
events::Event, config::Config, HtmlExporter}; +use arhivarch_downloader::config::Config; +use arhivarch_downloader::event::Event; +use arhivarch_downloader::export::{html::HtmlExporter, ExporterKind}; -use clap::Parser; -use anyhow::Result; +use clap::{Parser, ValueEnum}; use std::path::PathBuf; -use std::sync::mpsc; -fn main() -> anyhow::Result<()> { - let config = parse_args().unwrap_or_else(|e| { - eprintln!("Error: {}", e); - std::process::exit(1); - }); - - let (tx, rx) = mpsc::channel::<Event>(); +#[derive(Clone, ValueEnum)] +enum ExporterArg { + Html, +} +use std::sync::mpsc::channel; +fn main() -> anyhow::Result<()> { + let config = parse_args(); + let (tx, rx) = channel::<Event>(); let handle = std::thread::spawn({ let config = config.clone(); - move || backend::run(&config, &tx, &HtmlExporter) + move || arhivarch_downloader::run(&config, tx) }); for event in rx { render_event(&event); } - handle.join().unwrap() + let _ = handle.join().map_err(|e| anyhow::anyhow!("{:?}", e))?; + Ok(()) } -pub fn parse_args() -> Result<Config> { +pub fn parse_args() -> Config { #[derive(Parser)] #[command(about, long_about)] struct Cli { /// URL to download - url: Option<String>, + url: String, + + /// Path to download directory + #[arg(short = 'd', long = "dir", value_name = "DIR", default_value = ".", value_hint = clap::ValueHint::DirPath)] + dir: PathBuf, - /// Path to a text file containing a list of URLs (one per line) - #[arg(short = 'l', long = "list")] - list: Option<PathBuf>, + /// Exporter + #[arg(short = 'e', long = "exporter", value_name = "EXPORTER", default_value = "html")] + exporter: ExporterArg, /// Download thumbnail images, default: false #[arg(short = 't', long = "thumb", default_value_t = false)] @@ -47,85 +53,78 @@ pub fn parse_args() -> Result<Config> { /// Resume files and thumbnails downloading instead of overwriting. 
Useless if neither -t nor -f are set, default: false #[arg(short = 'r', long = "resume", default_value_t = false)] - resume: bool - } - let cli = Cli::parse(); + resume: bool, - let mut urls = Vec::new(); - // [URL] - if let Some(url) = cli.url { - urls.push(url); - } - // [List] - if let Some(list) = cli.list { - for line in std::fs::read_to_string(list)?.lines() { - urls.push(line.to_string()); - } - } - if urls.is_empty() { - anyhow::bail!("No URLs provided"); + /// Download retries in case of a error + #[arg(short = 'R', long = "retries", default_value_t = 3)] + download_retries: u32, } + let cli = Cli::parse(); - Ok(Config { - urls, + Config { + url: cli.url, + dir: cli.dir, + exporter: match cli.exporter { + ExporterArg::Html => ExporterKind::Html(HtmlExporter), + }, thumb: cli.thumb, files: cli.files, resume: cli.resume, - }) + download_retries: cli.download_retries, + } } fn render_event(event: &Event) { use std::io::Write; match event { - Event::ThreadStarted { url, index, total } => - println!("Processing {} ({} / {}):", url, index, total), - - Event::ThreadDone { url, elapsed_ms } => - println!("Done processing {} ({} ms)", url, elapsed_ms), - - Event::ThreadFailed { url, error } => - eprintln!("Error processing {}: {}", url, error), - - Event::FetchStarted { .. 
} => { - print!("\tGetting thread..."); + Event::GetStarted => { + print!("Fetching thread..."); std::io::stdout().flush().ok(); } - - Event::FetchDone { elapsed_ms } => - println!(" Done ({} ms)", elapsed_ms), - - Event::FetchRetrying { url, attempt, max_attempts, error } => { - eprintln!("\n\tHTTP request failed for {}: {}", url, error); - if attempt < max_attempts { - eprintln!("\tWaiting 3 seconds..."); - } + Event::GetDone => + println!(" Done."), + Event::GetFailed { error } => + eprintln!("\nFailed to fetch thread: {}", error), + + Event::DownloadAllStarted => + println!("Downloading stuff..."), + Event::DownloadAllDone => + println!("All downloads complete."), + Event::DownloadAllFailed { error } => + eprintln!("Download failed: {}", error), + + Event::DownloadStarted { index, max_index } => { + print!("\r\tDownloading {} / {}...", index, max_index); + std::io::stdout().flush().ok(); } - - Event::ParseStarted => { - print!("\tParsing posts..."); + Event::DownloadDone { index, max_index } => { + println!("\r\tDownloading {} / {}... Done.", index, max_index); + } + Event::DownloadFailed { url, error } => + eprintln!("\r\tFailed to download {}: {}", url, error), + Event::DownloadSkipped { index, max_index } => + println!("\r\tDownloading {} / {}... Skipped.", index, max_index), + + Event::DownloadFilesStarted => { + println!("Downloading files..."); std::io::stdout().flush().ok(); } - - Event::ParseDone { elapsed_ms, .. } => - println!(" Done ({} ms)", elapsed_ms), - - Event::DownloadBatchStarted { label, total_posts } => { - print!("\tDownloading {}... post 0 / {}", label, total_posts); + Event::DownloadFilesDone => + println!("Done."), + Event::DownloadThumbStarted => { + println!("Downloading thumbnails..."); std::io::stdout().flush().ok(); } + Event::DownloadThumbDone => + println!("Done."), - Event::DownloadBatchProgress { label, done, total } => { - print!("\r\tDownloading {}... 
post {} / {}", label, done, total); + Event::ExportStarted => { + print!("Exporting..."); std::io::stdout().flush().ok(); } - - Event::DownloadAssetFailed { label, filename, error, .. } => - println!("\r\tFailed to download {} {}: {}\n\t-> Waiting 3 seconds...", label, filename, error), - - Event::DownloadAssetSkipped { label, filename } => - println!("\tSkipping {} {} after 3 failed attempts.", label, filename), - - Event::DownloadBatchDone { elapsed_ms, .. } => - println!(" Done ({} ms)", elapsed_ms), + Event::ExportDone => + println!(" Done."), + Event::ExportFailed { error } => + eprintln!("\nExport failed: {}", error), } } diff --git a/src/config.rs b/src/config.rs @@ -1,7 +0,0 @@ -#[derive(Debug, Clone)] -pub struct Config { - pub urls: Vec<String>, - pub thumb: bool, - pub files: bool, - pub resume: bool, -} diff --git a/src/events.rs b/src/events.rs @@ -1,45 +0,0 @@ -#[derive(Debug, Clone)] -pub enum Event { - // Thread-level lifecycle - ThreadStarted { url: String, index: usize, total: usize }, - ThreadDone { url: String, elapsed_ms: u128 }, - ThreadFailed { url: String, error: String }, - - // HTTP fetch - FetchStarted { url: String }, - FetchDone { elapsed_ms: u128 }, - FetchRetrying { url: String, attempt: u32, max_attempts: u32, error: String }, - - // HTML parsing - ParseStarted, - ParseDone { post_count: usize, elapsed_ms: u128 }, - - // Asset downloading - DownloadBatchStarted { label: String, total_posts: usize }, - DownloadBatchProgress { label: String, done: usize, total: usize }, - DownloadAssetFailed { label: String, filename: String, attempt: u32, error: String }, - DownloadAssetSkipped { label: String, filename: String }, - DownloadBatchDone { label: String, elapsed_ms: u128 }, -} - -use std::sync::mpsc; - -/// Sink for progress events emitted by the library. -/// Implement this to connect the library to any frontend. 
-pub trait Reporter: Send + Sync { - fn report(&self, event: Event); -} - -/// Blanket impl: mpsc::Sender<Event> is already a valid Reporter. -impl Reporter for mpsc::Sender<Event> { - fn report(&self, event: Event) { - self.send(event).ok(); - } -} - -/// No-op reporter — useful in tests or when progress output is not needed. -pub struct NullReporter; - -impl Reporter for NullReporter { - fn report(&self, _event: Event) {} -} diff --git a/src/export/html/mod.rs b/src/export/html/mod.rs @@ -1,142 +0,0 @@ -use crate::{config::Config, events::{Event, Reporter}, http, post::{File, Post}}; -use anyhow::{Result, Context}; -use super::Export; - -mod render; - -const TEMPLATE: &str = include_str!("template.html"); - -pub struct HtmlExporter; - -impl Export for HtmlExporter { - fn export(&self, posts: &[Post], config: &Config, reporter: &dyn Reporter) -> Result<()> { - if posts.is_empty() { - anyhow::bail!("No posts to export"); - } - - let dir = format!("{}", posts[0].id); - std::fs::create_dir_all(&dir)?; - - let posts_html: String = posts - .iter() - .map(|p| render::render_post(p, config.files, config.thumb)) - .collect::<Vec<String>>() - .join("\n"); - - if config.files { - download_assets( - &posts, - &format!("{}/files", dir), - "files", - |f| &f.url, - config.resume, - reporter, - )?; - } - if config.thumb { - download_assets( - &posts, - &format!("{}/thumb", dir), - "thumbnails", - |f| &f.url_thumb, - config.resume, - reporter, - )?; - } - - let index_html = TEMPLATE.replace("{{posts}}", &posts_html); - std::fs::write(format!("{}/index.html", dir), index_html)?; - - Ok(()) - } -} - -/// Write a top-level index.html with one entry per thread (first post + link to thread folder) -pub fn write_index_html(first_posts: &[Post], config: &Config) -> Result<()> { - if first_posts.is_empty() { - return Ok(()); - } - - let posts_html: String = first_posts - .iter() - .map(|p| { - let mut post_html = render::render_post(p, config.files, config.thumb); - config.files.then(|| 
post_html = post_html.replace( - "<a href=\"files/", - &format!("<a href=\"{}/files/", p.id), - )); - config.thumb.then(|| post_html = post_html.replace( - "<img src=\"thumb/", - &format!("<img src=\"{}/thumb/", p.id), - )); - format!("<div><a href=\"{}/index.html\">В тред &rarr;</a></div>{}\n", p.id, post_html) - }) - .collect::<Vec<String>>() - .join("\n"); - - let index_html = TEMPLATE.replace("{{posts}}", &posts_html); - std::fs::write("index.html", index_html) - .context("failed to write index.html")?; - - Ok(()) -} - -fn download_assets( - posts: &[Post], - dest_dir: &str, - label: &str, - url_of: impl Fn(&File) -> &str, - skip_if_exists: bool, - reporter: &dyn Reporter, -) -> Result<()> { - std::fs::create_dir_all(dest_dir) - .with_context(|| format!("Failed to create directory {}", dest_dir))?; - - let t = std::time::Instant::now(); - reporter.report(Event::DownloadBatchStarted { - label: label.to_string(), - total_posts: posts.len(), - }); - - for (i, post) in posts.iter().enumerate() { - for f in &post.files { - let url = url_of(f); - let filename = url.split('/').last().unwrap_or("").to_string(); - let path = format!("{}/{}", dest_dir, filename); - if skip_if_exists && std::path::Path::new(&path).exists() { - continue; - } - let mut result = Err(anyhow::anyhow!("no attempts")); - for attempt in 1..=3u32 { - result = http::download(url, &path); - if result.is_ok() { break; } - let e = result.as_ref().unwrap_err(); - reporter.report(Event::DownloadAssetFailed { - label: label.to_string(), - filename: filename.clone(), - attempt, - error: e.to_string(), - }); - std::thread::sleep(std::time::Duration::from_secs(3)); - } - if result.is_err() { - reporter.report(Event::DownloadAssetSkipped { - label: label.to_string(), - filename: filename.clone(), - }); - } - } - reporter.report(Event::DownloadBatchProgress { - label: label.to_string(), - done: i + 1, - total: posts.len(), - }); - } - - reporter.report(Event::DownloadBatchDone { - label: label.to_string(), - 
elapsed_ms: t.elapsed().as_millis(), - }); - - Ok(()) -} diff --git a/src/export/html/render.rs b/src/export/html/render.rs @@ -1,139 +0,0 @@ -use crate::thread::{File, Post}; - -fn html_escape(s: &str) -> String { - s.replace('&', "&amp;") - .replace('<', "&lt;") - .replace('>', "&gt;") - .replace('"', "&quot;") -} - -/// Converts plain post text to HTML. -/// - `>>id` → reply link anchor -/// - Lines starting with `>` (not `>>digit`) → greentext span -/// - `\n` → `<br>` -pub fn render_text_to_html(text: &str) -> String { - let needle = "&gt;&gt;"; - - let lines: Vec<String> = text.split('\n').map(|line| { - let escaped = html_escape(line); - - // Replace >>id with reply link anchors - let mut processed = String::with_capacity(escaped.len()); - let mut rest = escaped.as_str(); - while let Some(pos) = rest.find(needle) { - processed.push_str(&rest[..pos]); - let after = &rest[pos + needle.len()..]; - let digit_end = after.find(|c: char| !c.is_ascii_digit()).unwrap_or(after.len()); - if digit_end > 0 { - let id = &after[..digit_end]; - processed.push_str(&format!("<a href=\"#post{id}\" class=\"reply-link\">&gt;&gt;{id}</a>")); - rest = &after[digit_end..]; - } else { - processed.push_str(needle); - rest = after; - } - } - processed.push_str(rest); - - // Wrap in greentext span if line starts with > but not >>digit - let is_greentext = escaped.starts_with("&gt;") - && !escaped.strip_prefix(needle).is_some_and(|s| s.starts_with(|c: char| c.is_ascii_digit())); - if is_greentext { - format!("<span class=\"quote\">{processed}</span>") - } else { - processed - } - }).collect(); - - lines.join("<br>\n") -} - -/// Renders a single post to an HTML fragment string. 
-pub fn render_post(post: &Post, download_files: bool, download_thumbnails: bool) -> String { - let mut html = format!("<div class=\"post\" id=\"post{}\">\n", post.id); - - html.push_str(" <div class=\"post-head\">\n"); - - // Subject - if let Some(ref subject) = post.subject { - html.push_str(&format!( - " <span class=\"post-subject\">{}</span>\n", - html_escape(subject) - )); - } - - // Name /w mailto/sage - let name = post.name.as_deref().unwrap_or("Аноним"); - let name_display = if let Some(ref mailto) = post.mailto { - format!("[{}] {}", mailto, name) - } else { - name.to_string() - }; - html.push_str(&format!( - " <span class=\"post-name\">{}</span>\n", - html_escape(&name_display) - )); - - // Time, num, id - html.push_str(&format!(" <span class=\"post-time\">{}</span>\n", html_escape(&post.time))); - html.push_str(&format!(" <span class=\"post-num\">{}</span>\n", html_escape(&post.num))); - html.push_str(&format!( - " <span class=\"post-id\"><a href=\"#post{0}\">№{0}</a></span>\n", - post.id - )); - - html.push_str(" </div>\n"); - - // Images - html.push_str(&render_images(&post.files, download_files, download_thumbnails)); - - // Body - html.push_str(" <div class=\"post-body\">\n"); - if !post.text.is_empty() { - html.push_str(" "); - html.push_str(&render_text_to_html(&post.text)); - html.push('\n'); - } - html.push_str(" </div>\n"); - - html.push_str("</div>\n"); - html -} - -fn render_images( - files: &[File], - download_files: bool, - download_thumbnails: bool, -) -> String { - if files.is_empty() { - return String::new(); - } - - let mut html = String::from(" <div class=\"post-images\">\n"); - for file in files { - let href = if download_files && !file.url.is_empty() { - format!("files/{}", file.url.split('/').last().unwrap_or("")) - } else { - file.url.clone() - }; - - let thumb_filename = file.url_thumb.split('/').last().unwrap_or("").to_string(); - let img_src = if download_thumbnails && !file.url_thumb.is_empty() { - format!("thumb/{}", 
thumb_filename) - } else { - file.url_thumb.clone() - }; - - html.push_str(&format!( - " <div class=\"post-image\">\n <a href=\"{}\" target=\"_blank\" title=\"{}\">\n <img src=\"{}\" alt=\"\" loading=\"lazy\">\n </a>\n <div class=\"post-image-info\">{} (<a href=\"{}\" target=\"_blank\" class=\"post-image-link\">o</a>, <a href=\"{}\" target=\"_blank\" class=\"post-image-link\">t</a>)</div>\n </div>\n", - html_escape(&href), - html_escape(&file.name_orig), - html_escape(&img_src), - html_escape(&file.name_orig), - html_escape(&file.url), - html_escape(&file.url_thumb), - )); - } - html.push_str(" </div>\n"); - html -} diff --git a/src/export/mod.rs b/src/export/mod.rs @@ -1,8 +0,0 @@ -use crate::{config::Config, events::Reporter, post::Post}; -use anyhow::Result; - -pub mod html; - -pub trait Export { - fn export(&self, posts: &[Post], config: &Config, reporter: &dyn Reporter) -> Result<()>; -} diff --git a/src/http.rs b/src/http.rs @@ -1,35 +0,0 @@ -use anyhow::{Context, Result}; - -use crate::events::{Event, Reporter}; - -/// GET a URL with up to `attempts` retries, reporting each failure via `reporter`. -pub fn fetch_with_retry(url: &str, attempts: u32, reporter: &dyn Reporter) -> Result<String> { - for attempt in 1..=attempts { - match reqwest::blocking::get(url).and_then(|r| r.text()) { - Ok(text) => return Ok(text), - Err(e) => { - reporter.report(Event::FetchRetrying { - url: url.to_string(), - attempt, - max_attempts: attempts, - error: e.to_string(), - }); - if attempt < attempts { - std::thread::sleep(std::time::Duration::from_secs(3)); - } - } - } - } - anyhow::bail!("failed to get thread after {attempts} attempts") -} - -/// Download a single URL and write it to `path`. -pub fn download(url: &str, path: &str) -> Result<()> { - let bytes = reqwest::blocking::get(url) - .with_context(|| format!("HTTP GET failed for {}", url))? 
- .bytes() - .context("failed to read response body")?; - std::fs::write(path, &bytes) - .with_context(|| format!("failed to write {}", path))?; - Ok(()) -} diff --git a/src/lib.rs b/src/lib.rs @@ -1,9 +0,0 @@ -pub mod config; -pub mod events; -pub mod backend; -pub mod post; -pub mod http; -pub mod export; - -pub use events::{Reporter, NullReporter}; -pub use export::html::HtmlExporter; diff --git a/src/lib/config.rs b/src/lib/config.rs @@ -0,0 +1,12 @@ +use crate::export::ExporterKind; + +#[derive(Clone)] +pub struct Config { + pub url: String, + pub dir: std::path::PathBuf, + pub exporter: ExporterKind, + pub thumb: bool, + pub files: bool, + pub resume: bool, + pub download_retries: u32, +} diff --git a/src/lib/download.rs b/src/lib/download.rs @@ -0,0 +1,25 @@ +use anyhow::{anyhow, Result}; + +/// Downloads a URL, retrying up to `tries` times. +/// +/// # Errors +/// Returns an error if all attempts fail or `tries` is 0. +pub fn download(url: &str, tries: u32) -> Result<reqwest::blocking::Response> { + static CLIENT: std::sync::LazyLock<reqwest::blocking::Client> = + std::sync::LazyLock::new(reqwest::blocking::Client::new); + + for attempt in 0..tries { + if attempt > 0 { + std::thread::sleep(std::time::Duration::from_millis(500 * 2u64.pow(attempt))); + } + let response = CLIENT.get(url).send()?; + if response.status().is_success() { + return Ok(response); + } + if response.status().is_client_error() { + return Err(anyhow!("client error: {}", response.status())); + } + } + + Err(anyhow!("failed to download {} after {} tries", url, tries)) +} diff --git a/src/lib/event.rs b/src/lib/event.rs @@ -0,0 +1,29 @@ +#[derive(Debug, Clone)] +pub enum Event { + // Thread retrieval + GetStarted, + GetDone, + GetFailed { error: String }, + + // Files download + DownloadAllStarted, + DownloadAllDone, + DownloadAllFailed { error: String }, + + // File download + DownloadStarted { index: usize, max_index: usize }, + DownloadDone { index: usize, max_index: usize }, + 
DownloadSkipped { index: usize, max_index: usize }, + DownloadFailed { url: String, error: String }, + + // Files and thumbnails download + DownloadFilesStarted, + DownloadFilesDone, + DownloadThumbStarted, + DownloadThumbDone, + + // Thread export + ExportStarted, + ExportDone, + ExportFailed { error: String }, +} diff --git a/src/lib/export/html/mod.rs b/src/lib/export/html/mod.rs @@ -0,0 +1,27 @@ +use crate::{config::Config, post::Post}; +use anyhow::Result; +use super::Exporter; + +mod render; + +const TEMPLATE: &str = include_str!("template.html"); + +#[derive(Clone)] +pub struct HtmlExporter; + +impl Exporter for HtmlExporter { + fn export(&self, posts: &[Post], config: &Config) -> Result<()> { + anyhow::ensure!(!posts.is_empty(), "No posts to export"); + + std::fs::create_dir_all(&config.dir)?; + let posts_html = posts + .iter() + .map(|p| render::render_post(p, config.files, config.thumb)) + .collect::<Vec<String>>() + .join("\n"); + let index_html = TEMPLATE.replace("{{posts}}", &posts_html); + std::fs::write(config.dir.join("index.html"), index_html)?; + + Ok(()) + } +} diff --git a/src/lib/export/html/render.rs b/src/lib/export/html/render.rs @@ -0,0 +1,140 @@ +use crate::post::{File, Post}; + +/// Renders a single post to an HTML fragment string. 
+/// If download_files or download_thumbnails is true, the links will be converted to local paths +pub fn render_post(post: &Post, download_files: bool, download_thumbnails: bool) -> String { + let mut html = format!("<div class=\"post\" id=\"post{}\">\n", post.id); + + html.push_str(" <div class=\"post-head\">\n"); + + // Subject + if let Some(ref subject) = post.subject { + html.push_str(&format!( + " <span class=\"post-subject\">{}</span>\n", + html_escape(subject) + )); + } + + // Name /w mailto/sage + let name = post.name.as_deref().unwrap_or("Аноним"); + let name_display = if let Some(ref mailto) = post.mailto { + format!("[{}] {}", mailto, name) + } else { + name.to_string() + }; + html.push_str(&format!( + " <span class=\"post-name\">{}</span>\n", + html_escape(&name_display) + )); + + // Time, num, id + html.push_str(&format!(" <span class=\"post-time\">{}</span>\n", html_escape(&post.time))); + html.push_str(&format!(" <span class=\"post-num\">{}</span>\n", html_escape(&post.num))); + html.push_str(&format!( + " <span class=\"post-id\"><a href=\"#post{0}\">№{0}</a></span>\n", + post.id + )); + + html.push_str(" </div>\n"); + + // Images + html.push_str(&render_images(&post.files, download_files, download_thumbnails)); + + // Body + html.push_str(" <div class=\"post-body\">\n"); + if !post.text.is_empty() { + html.push_str(" "); + html.push_str(&render_text_to_html(&post.text)); + html.push('\n'); + } + html.push_str(" </div>\n"); + + html.push_str("</div>\n"); + html +} + +fn html_escape(s: &str) -> String { + s.replace('&', "&amp;") + .replace('<', "&lt;") + .replace('>', "&gt;") + .replace('"', "&quot;") +} + +/// Converts plain post text to HTML. 
+/// - `>>id` → reply link anchor +/// - Lines starting with `>` (not `>>digit`) → greentext span +/// - `\n` → `<br>` +fn render_text_to_html(text: &str) -> String { + let needle = "&gt;&gt;"; + + let lines: Vec<String> = text.split('\n').map(|line| { + let escaped = html_escape(line); + + // Replace >>id with reply link anchors + let mut processed = String::with_capacity(escaped.len()); + let mut rest = escaped.as_str(); + while let Some(pos) = rest.find(needle) { + processed.push_str(&rest[..pos]); + let after = &rest[pos + needle.len()..]; + let digit_end = after.find(|c: char| !c.is_ascii_digit()).unwrap_or(after.len()); + if digit_end > 0 { + let id = &after[..digit_end]; + processed.push_str(&format!("<a href=\"#post{id}\" class=\"reply-link\">&gt;&gt;{id}</a>")); + rest = &after[digit_end..]; + } else { + processed.push_str(needle); + rest = after; + } + } + processed.push_str(rest); + + // Wrap in greentext span if line starts with > but not >>digit + let is_greentext = escaped.starts_with("&gt;") + && !escaped.strip_prefix(needle).is_some_and(|s| s.starts_with(|c: char| c.is_ascii_digit())); + if is_greentext { + format!("<span class=\"quote\">{processed}</span>") + } else { + processed + } + }).collect(); + + lines.join("<br>\n") +} + +fn render_images( + files: &[File], + download_files: bool, + download_thumbnails: bool, +) -> String { + if files.is_empty() { + return String::new(); + } + + let mut html = String::from(" <div class=\"post-images\">\n"); + for file in files { + let href = if download_files && !file.url.is_empty() { + format!("files/{}", file.url.split('/').last().unwrap_or("")) + } else { + file.url.clone() + }; + + let thumb_filename = file.url_thumb.split('/').last().unwrap_or("").to_string(); + let img_src = if download_thumbnails && !file.url_thumb.is_empty() { + format!("thumb/{}", thumb_filename) + } else { + file.url_thumb.clone() + }; + + html.push_str(&format!( + " <div class=\"post-image\">\n <a href=\"{}\" target=\"_blank\" 
title=\"{}\">\n <img src=\"{}\" alt=\"\" loading=\"lazy\">\n </a>\n <div class=\"post-image-info\">{} (<a href=\"{}\" target=\"_blank\" class=\"post-image-link\">o</a>, <a href=\"{}\" target=\"_blank\" class=\"post-image-link\">t</a>)</div>\n </div>\n", + html_escape(&href), + html_escape(&file.name_orig), + html_escape(&img_src), + html_escape(&file.name_orig), + html_escape(&file.url), + html_escape(&file.url_thumb), + )); + } + html.push_str(" </div>\n"); + html +} diff --git a/src/export/html/template.html b/src/lib/export/html/template.html diff --git a/src/lib/export/mod.rs b/src/lib/export/mod.rs @@ -0,0 +1,35 @@ +pub mod html; + +use super::{config::Config, post::Post}; + +use anyhow::Result; + +use std::str::FromStr; + +#[derive(Clone)] +pub enum ExporterKind { + Html(html::HtmlExporter), +} + +pub trait Exporter { + fn export(&self, posts: &[Post], config: &Config) -> Result<()>; +} + +impl Exporter for ExporterKind { + fn export(&self, posts: &[Post], config: &Config) -> Result<()> { + match self { + ExporterKind::Html(html) => html.export(posts, config), + } + } +} + +impl FromStr for ExporterKind { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result<ExporterKind> { + match s.to_lowercase().as_str() { + "html" => Ok(ExporterKind::Html(html::HtmlExporter {})), + _ => anyhow::bail!("unknown exporter: {}", s), + } + } +} diff --git a/src/lib/lib.rs b/src/lib/lib.rs @@ -0,0 +1,96 @@ +pub mod config; +pub mod event; +pub mod export; + +mod download; +mod post; + +use crate::post::{Post, File}; +use crate::export::Exporter; + +use anyhow::{Result, Context}; + +use std::sync::mpsc::Sender; + +pub const BASE_URL: &str = "https://arhivach.vc"; + +pub fn run(config: &config::Config, tx: Sender<event::Event>) -> Result<()> { + tx.send(event::Event::GetStarted)?; + let html = download::download(&config.url, config.download_retries)?.text()?; + let posts = Post::parse_posts(&html) + .inspect_err(|e| { let _ = tx.send(event::Event::GetFailed { error: 
format!("{:#}", e) }); }) + .context("failed to parse posts")?; + tx.send(event::Event::GetDone)?; + + tx.send(event::Event::DownloadAllStarted)?; + run_download(&posts, &config, tx.clone()) + .inspect_err(|e| { let _ = tx.send(event::Event::DownloadAllFailed { error: format!("{:#}", e) }); }) + .context("failed to download files")?; + tx.send(event::Event::DownloadAllDone)?; + + tx.send(event::Event::ExportStarted)?; + config.exporter.export(&posts, config) + .inspect_err(|e| { let _ = tx.send(event::Event::ExportFailed { error: format!("{:#}", e) }); }) + .context("failed to export")?; + tx.send(event::Event::ExportDone)?; + + Ok(()) +} + +/// Download files and thumbnails. Send DownloadStarted, DownloadDone and DownloadFailed events +fn run_download(posts: &[Post], config: &config::Config, tx: Sender<event::Event>) -> Result<()> { + std::fs::create_dir_all(&config.dir)?; + + let download_item = |url: &str, filepath: &std::path::PathBuf| -> Result<()> { + let result = download::download(url, config.download_retries)?; + anyhow::ensure!(result.status().is_success(), "failed to download {}: {}", url, result.status()); + let bytes = result.bytes()?; + anyhow::ensure!(!bytes.is_empty(), "empty file: {}", url); + std::fs::write(filepath, bytes)?; + Ok(()) + }; + + let download_section = | + subdir: &str, + get_url: fn(&File) -> (&str, &str), + | -> Result<()> { + let dir = config.dir.join(subdir); + std::fs::create_dir_all(&dir)?; + + let mut index: usize = 1; + let max_index: usize = posts.iter().map(|p| p.files.len()).sum(); + for f in posts.iter().flat_map(|p| &p.files) { + tx.send(event::Event::DownloadStarted { index, max_index })?; + let (url, fallback) = get_url(f); + let filename = url.rsplit("/").next().unwrap_or(fallback).trim(); + let filepath = dir.join(filename); + if config.resume && filepath.exists() { + tx.send(event::Event::DownloadSkipped { index, max_index })?; + index += 1; + continue + } + match download_item(url, &filepath) { + Ok(()) => 
tx.send(event::Event::DownloadDone{ index, max_index })?,
+ Err(e) => tx.send(event::Event::DownloadFailed {
+ url: url.to_string(),
+ error: format!("{:#}", e)
+ })?
+ };
+ index += 1;
+ }
+ Ok(())
+ };
+
+ if config.files {
+ tx.send(event::Event::DownloadFilesStarted)?;
+ download_section("files", |f| (&f.url, &f.name_timestamp))?;
+ tx.send(event::Event::DownloadFilesDone)?;
+ }
+ if config.thumb {
+ tx.send(event::Event::DownloadThumbStarted)?;
+ download_section("thumb", |f| (&f.url_thumb, &f.name_timestamp))?;
+ tx.send(event::Event::DownloadThumbDone)?;
+ }
+
+ Ok(())
+}
diff --git a/src/lib/post.rs b/src/lib/post.rs
@@ -0,0 +1,374 @@
+use super::BASE_URL;
+
+use anyhow::{Context, Result};
+
+#[derive(Debug, Clone)]
+pub struct File {
+ /// original name, "videolol.mp4"
+ pub name_orig: String,
+ /// timestamp name, "17699100670710.mp4"
+ pub name_timestamp: String,
+ /// thumbnail url, "https://arhivach.vc/storage/t/aeaa7825f8d8ffe3f07f242a59b7761c.thumb"
+ pub url_thumb: String,
+ /// url, "https://i.arhivach.vc/storage/a/ea/aeaa7825f8d8ffe3f07f242a59b7761c.mp4"
+ pub url: String,
+}
+
+impl std::fmt::Display for File {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ write!(
+ f,
+ "{} [{}]\n url: {}\n thumb: {}",
+ self.name_orig, self.name_timestamp, self.url, self.url_thumb
+ )
+ }
+}
+
+struct PostHead {
+ subject: Option<String>,
+ name: Option<String>,
+ mailto: Option<String>,
+ time: String,
+ num: String,
+ id: u32,
+}
+
+/// Represents a single post in a thread
+#[derive(Debug, Clone)]
+pub struct Post {
+ /// Empty if None
+ pub subject: Option<String>,
+ /// "Аноним" if none
+ pub name: Option<String>,
+ /// "mailto:sage"
+ pub mailto: Option<String>,
+ /// "01/02/26 Вск 03:13:12"
+ pub time: String,
+ /// "#5"
+ pub num: String,
+ /// "329281515"
+ pub id: u32,
+ pub files: Vec<File>,
+ /// Post text
+ pub text: String,
+}
+
+impl Post {
+ pub fn parse_posts(
+ html: &str,
+ ) -> Result<Vec<Post>> {
+ let mut posts = 
Vec::new(); + + let document = scraper::Html::parse_document(html); + let selector = scraper::Selector::parse(r#"div.post"#).unwrap(); + for node in document.select(&selector) { + let post = Post::parse_post(node)?; + posts.push(post); + } + + Ok(posts) + } + + /// Parse div class="post" + /// + /// Example element: + /// ```html + /// <div class="post" id="post329274763" postid="329274763"> + /// <div class="post_head">...</div> (see parse_post_head function) + /// <span class="post_comment">...</span> (see parse_post_comment function) + /// </div> + /// ``` + fn parse_post(node: scraper::ElementRef) -> Result<Post> { + static SEL_POST_HEAD: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new( + || scraper::Selector::parse("div.post_head").unwrap() + ); + static SEL_POST_IMAGE_BLOCK: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new( + || scraper::Selector::parse("span.post_comment").unwrap() + ); + + let post_head = node + .select(&SEL_POST_HEAD) + .next() + .context("missing post_head")?; + let head = Post::parse_post_head(post_head)?; + + let post_comment = node + .select(&SEL_POST_IMAGE_BLOCK) + .next() + .context("missing post_comment")?; + let (files, text) = Post::parse_post_comment(post_comment)?; + + Ok(Post { + subject: head.subject, + name: head.name, + mailto: head.mailto, + time: head.time, + num: head.num, + id: head.id, + files, + text, + }) + } + + /// Parses the post_head element + /// + /// Returns (subject, name, mailto, time, num, id) + /// Returns error if no time, num or id is found or if id is not a number + /// + /// Example element: + /// ```html + /// <div class="post_head"> + /// <span class="poster_name" title="">Аноним</span>&nbsp; + /// <span class="post_time">01/02/26 Вск 04:27:32</span>&nbsp; + /// <span class="post_num">#77</span>&nbsp; + /// <span class="post_id"> + /// <a style="position:absolute;margin-top:-50px;" id="329274763"></a> + /// <a href="#329274763">№329274763</a> + /// </span> &nbsp; + 
/// </div> + /// ``` + fn parse_post_head(post_head: scraper::ElementRef) -> Result<PostHead> { + static SEL_SPAN_POST_ID_A_HREF: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new( + || scraper::Selector::parse("span.post_id a[href]").unwrap() + ); + static SEL_H1_POST_SUBJECT: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new( + || scraper::Selector::parse("h1.post_subject").unwrap() + ); + static SEL_SPAN_POSTER_NAME: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new( + || scraper::Selector::parse("span.poster_name").unwrap() + ); + static SEL_A_POST_MAIL: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new( + || scraper::Selector::parse("a.post_mail").unwrap() + ); + static SEL_SPAN_POST_TIME: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new( + || scraper::Selector::parse("span.post_time").unwrap() + ); + static SEL_SPAN_POST_NUM: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new( + || scraper::Selector::parse("span.post_num").unwrap() + ); + + let id: u32 = post_head + .select(&SEL_SPAN_POST_ID_A_HREF) + .next() + .and_then(|el| el.value().attr("href")) + .and_then(|href| href.strip_prefix('#')) + .context("missing post id")? + .parse()?; + + let subject = post_head + .select(&SEL_H1_POST_SUBJECT) + .next() + .map(|el| el.text().collect::<String>()); + + let name = post_head + .select(&SEL_SPAN_POSTER_NAME) + .next() + .map(|el| el.text().collect::<String>()) + .and_then(|n| if n == "Аноним" { None } else { Some(n) }); + + let mailto = post_head + .select(&SEL_A_POST_MAIL) + .next() + .and_then(|el| el.value().attr("title")) + .map(|s| s.to_string()); + + let time = post_head + .select(&SEL_SPAN_POST_TIME) + .next() + .context("missing post_time")? + .text() + .collect::<String>(); + + let num = post_head + .select(&SEL_SPAN_POST_NUM) + .next() + .context("missing post_num")? 
+ .text()
+ .collect::<String>();
+
+ Ok(PostHead { subject, name, mailto, time, num, id })
+ }
+
+ /// Parses the span post_comment element from a post element
+ ///
+ /// Returns (files, text)
+ ///
+ /// Example element:
+ /// <span class="post_comment">
+ /// <div class="post_image_block" ...>...</div> (see parse_post_image_block function) (can appear 0 to multiple times)
+ /// <div class="post_comment_body">...</div> (see parse_post_comment_body function)
+ /// </span>
+ fn parse_post_comment(
+ node: scraper::ElementRef,
+ ) -> Result<(Vec<File>, String)> {
+ static SEL_POST_IMAGE_BLOCK: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new(
+ || scraper::Selector::parse("div.post_image_block").unwrap()
+ );
+ static SEL_POST_COMMENT_BODY: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new(
+ || scraper::Selector::parse("div.post_comment_body").unwrap()
+ );
+
+ // TODO handle the errors instead of propagating them upward. Change the return type to non-Result
+ let files: Vec<File> = node
+ .select(&SEL_POST_IMAGE_BLOCK)
+ .map(Post::parse_post_image_block)
+ .collect();
+ let text = Post::parse_post_comment_body(node
+ .select(&SEL_POST_COMMENT_BODY)
+ .next()
+ .context("missing post_comment_body")?);
+ Ok((files, text))
+ }
+
+ /// Parses "post_image_block" element
+ /// Returns File
+ ///
+ /// Example element:
+ /// ```html
+ /// <div class="post_image_block" id="pib_77_2" pib="77_2" title="537.4 Кб, 946 x 946
+ /// image.png
+ /// 17699092523481.png">
+ /// <a class="expand_image" onclick="expand_local('77_2','/storage/a/cc/acc7f5856bc60ad3bdbd4dc7027e33f9.png','946','946',event); return false;" href="#">
+ /// <div class="post_image" id="thumb_77_2">
+ /// <img src="/storage/t/acc7f5856bc60ad3bdbd4dc7027e33f9.png" alt="" loading="lazy"> // thumbnail path
+ /// </div>
+ /// </a>
+ /// <a href="/storage/a/cc/acc7f5856bc60ad3bdbd4dc7027e33f9.png" target="_blank" class="img_filename">image.png</a> // can also be 
https://i.arhivach.vc/... if it's a video + /// </div> + /// ``` + fn parse_post_image_block(pib: scraper::ElementRef) -> File { + static SEL_POST_IMAGE_IMG: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new( + || scraper::Selector::parse(".post_image img").unwrap() + ); + static SEL_A_IMG_FILENAME: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new( + || scraper::Selector::parse("a.img_filename").unwrap() + ); + + // Title example: + // 402.2 Кб, 800 x 532 + // image.png <- name_orig + // 17699142349880.png <- name_timestamp + let title = pib.value().attr("title").unwrap_or(""); + let title_lines: Vec<&str> = title.lines().collect(); + let name_orig = title_lines + .get(1) + .map(|s| s.to_string()) + .unwrap_or("unnamed".to_string()); + let name_timestamp = title_lines + .get(2) + .map(|s| s.to_string()) + .unwrap_or("unnamed".to_string()); + + // url_thumb + let url_thumb = pib + .select(&SEL_POST_IMAGE_IMG) + .next() + .and_then(|el| el.value().attr("src")) + .unwrap_or(""); // /storage/t/83c2fe5ba9a8469d9eeef4af124e3b52.thumb + let url_thumb = if url_thumb.is_empty() { + String::new() + } else { + format!("{BASE_URL}{url_thumb}") + }; + + // url + let url = pib + .select(&SEL_A_IMG_FILENAME) + .next() + .and_then(|el| el.value().attr("href")) + .unwrap_or(""); + let url = if url.starts_with("http") { // is `https://i.arhivach.vc/...`? + url.to_string() + } else if url.is_empty() { + String::new() + } else { + format!("{BASE_URL}{url}") + }; + + File { + name_orig, + name_timestamp, + url_thumb, + url, + } + } + + /// Parses the post text from `div.post_comment_body` + /// + /// Returns post text: + /// - References are plaintext (e.g. >>329274789) + /// - `<br>` is replaced with \n + /// - `<span class="unkfunc">` (greentext) is replaced with >text + /// + /// If the text contains a reference (e.g. 
>>329274789) it looks like this in the element: + /// ```html + /// <div class="post_comment_body"> + /// <a href="#329274893" class="post-reply-link" data-thread="329273515" data-num="329274893">&gt;&gt;329274893</a> // This will be replaced with >>329274893 + /// <br> + /// <span class="unkfunc">&gt;greentext1</span> + /// <br> + /// text1 + /// </div> + /// ``` + /// + /// This example returns: + /// ```text + /// >>329274893 + /// >greentext1 + /// text1 + /// ``` + fn parse_post_comment_body(node: scraper::ElementRef) -> String { + use scraper::node::Node; + + let mut result = String::new(); + for child in node.children() { + match child.value() { + Node::Text(text) => result.push_str(&text.text), + Node::Element(el) if el.name() == "br" => result.push('\n'), + Node::Element(_) => { + if let Some(el_ref) = scraper::ElementRef::wrap(child) { + result.push_str(&el_ref.text().collect::<String>()); + } + } + _ => {} + } + } + result.trim().to_string() + } +} + +impl std::fmt::Display for Post { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // Header line + let name = self.name.as_deref().unwrap_or("Аноним"); + let mailto = self.mailto.as_deref().unwrap_or(""); + + if !mailto.is_empty() { + write!(f, "{} ({})", name, mailto)?; + } else { + write!(f, "{}", name)?; + } + + write!(f, " {} {} ID:{}", self.time, self.num, self.id)?; + + // Subject + if let Some(ref subject) = self.subject { + write!(f, "\n{}", subject)?; + } + + // Files + if !self.files.is_empty() { + write!(f, "\n[Files: {}]", self.files.len())?; + for file in &self.files { + write!(f, "\n - {}", file)?; + } + } + + // Post text + if !self.text.is_empty() { + write!(f, "\n{}", self.text)?; + } + + Ok(()) + } +} diff --git a/src/post.rs b/src/post.rs @@ -1,374 +0,0 @@ -use anyhow::{Context, Result}; - -const BASE_URL: &str = "https://arhivach.vc"; - -#[derive(Debug, Clone)] -pub struct File { - /// original name, "videolol.mp4" - pub name_orig: String, - /// timestampname, 
"17699100670710.mp4" - pub name_timestamp: String, - /// thumbnail url, "https://arhivach.vc/storage/t/aeaa7825f8d8ffe3f07f242a59b7761c.thumb" - pub url_thumb: String, - /// url, "https://i.arhivach.vc/storage/a/ea/aeaa7825f8d8ffe3f07f242a59b7761c.mp4" - pub url: String, -} - -impl std::fmt::Display for File { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "{} [{}]\n url: {}\n thumb: {}", - self.name_orig, self.name_timestamp, self.url, self.url_thumb - ) - } -} - -struct PostHead { - subject: Option<String>, - name: Option<String>, - mailto: Option<String>, - time: String, - num: String, - id: u32, -} - -/// Represents a single post in a thread -#[derive(Debug, Clone)] -pub struct Post { - /// Empty if None - pub subject: Option<String>, - /// "Аноним" if none - pub name: Option<String>, - /// "mailto:sage" - pub mailto: Option<String>, - /// "01/02/26 Вск 03:13:12" - pub time: String, - /// "#5" - pub num: String, - /// "329281515" - pub id: u32, - pub files: Vec<File>, - /// Post text - pub text: String, -} - -impl Post { - pub fn parse_posts( - html: &str, - ) -> Result<Vec<Post>> { - let mut posts = Vec::new(); - - let document = scraper::Html::parse_document(html); - let selector = scraper::Selector::parse(r#"div.post"#).unwrap(); - for node in document.select(&selector) { - let post = Post::parse_post(node)?; - posts.push(post); - } - - Ok(posts) - } - - /// Parse div class="post" - /// - /// Example element: - /// ```html - /// <div class="post" id="post329274763" postid="329274763"> - /// <div class="post_head">...</div> (see parse_post_head function) - /// <span class="post_comment">...</span> (see parse_post_comment function) - /// </div> - /// ``` - fn parse_post(node: scraper::ElementRef) -> Result<Post> { - static SEL_POST_HEAD: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new( - || scraper::Selector::parse("div.post_head").unwrap() - ); - static SEL_POST_IMAGE_BLOCK: 
std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new( - || scraper::Selector::parse("span.post_comment").unwrap() - ); - - let post_head = node - .select(&SEL_POST_HEAD) - .next() - .context("missing post_head")?; - let head = Post::parse_post_head(post_head)?; - - let post_comment = node - .select(&SEL_POST_IMAGE_BLOCK) - .next() - .context("missing post_comment")?; - let (files, text) = Post::parse_post_comment(post_comment)?; - - Ok(Post { - subject: head.subject, - name: head.name, - mailto: head.mailto, - time: head.time, - num: head.num, - id: head.id, - files, - text, - }) - } - - /// Parses the post_head element - /// - /// Returns (subject, name, mailto, time, num, id) - /// Returns error if no time, num or id is found or if id is not a number - /// - /// Example element: - /// ```html - /// <div class="post_head"> - /// <span class="poster_name" title="">Аноним</span>&nbsp; - /// <span class="post_time">01/02/26 Вск 04:27:32</span>&nbsp; - /// <span class="post_num">#77</span>&nbsp; - /// <span class="post_id"> - /// <a style="position:absolute;margin-top:-50px;" id="329274763"></a> - /// <a href="#329274763">№329274763</a> - /// </span> &nbsp; - /// </div> - /// ``` - fn parse_post_head(post_head: scraper::ElementRef) -> Result<PostHead> { - static SEL_SPAN_POST_ID_A_HREF: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new( - || scraper::Selector::parse("span.post_id a[href]").unwrap() - ); - static SEL_H1_POST_SUBJECT: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new( - || scraper::Selector::parse("h1.post_subject").unwrap() - ); - static SEL_SPAN_POSTER_NAME: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new( - || scraper::Selector::parse("span.poster_name").unwrap() - ); - static SEL_A_POST_MAIL: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new( - || scraper::Selector::parse("a.post_mail").unwrap() - ); - static SEL_SPAN_POST_TIME: std::sync::LazyLock<scraper::Selector> = 
std::sync::LazyLock::new( - || scraper::Selector::parse("span.post_time").unwrap() - ); - static SEL_SPAN_POST_NUM: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new( - || scraper::Selector::parse("span.post_num").unwrap() - ); - - let id: u32 = post_head - .select(&SEL_SPAN_POST_ID_A_HREF) - .next() - .and_then(|el| el.value().attr("href")) - .and_then(|href| href.strip_prefix('#')) - .context("missing post id")? - .parse()?; - - let subject = post_head - .select(&SEL_H1_POST_SUBJECT) - .next() - .map(|el| el.text().collect::<String>()); - - let name = post_head - .select(&SEL_SPAN_POSTER_NAME) - .next() - .map(|el| el.text().collect::<String>()) - .and_then(|n| if n == "Аноним" { None } else { Some(n) }); - - let mailto = post_head - .select(&SEL_A_POST_MAIL) - .next() - .and_then(|el| el.value().attr("title")) - .map(|s| s.to_string()); - - let time = post_head - .select(&SEL_SPAN_POST_TIME) - .next() - .context("missing post_time")? - .text() - .collect::<String>(); - - let num = post_head - .select(&SEL_SPAN_POST_NUM) - .next() - .context("missing post_num")? 
- .text() - .collect::<String>(); - - Ok(PostHead { subject, name, mailto, time, num, id }) - } - - /// Parses the sapn post_comment element from a post element - /// - /// Returns (files, text) - /// - /// Example element: - /// <span class="post_comment"> - /// <div class="post_image_block" ...>...</div> (see parse_post_image_block function) (can appear 0 to multiple times) - /// <div class="post_comment_body">...</div> (see parse_post_comment_body function) - /// </span> - fn parse_post_comment( - node: scraper::ElementRef, - ) -> Result<(Vec<File>, String)> { - static SEL_POST_IMAGE_BLOCK: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new( - || scraper::Selector::parse("div.post_image_block").unwrap() - ); - static SEL_POST_COMMENT_BODY: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new( - || scraper::Selector::parse("div.post_comment_body").unwrap() - ); - - // TODO handle the errors instead of propagating them upper. Change the return type to non-Result - let files: Vec<File> = node - .select(&SEL_POST_IMAGE_BLOCK) - .map(Post::parse_post_image_block) - .collect(); - let text = Post::parse_post_comment_body(node - .select(&SEL_POST_COMMENT_BODY) - .next() - .context("missing post_comment_body")?); - Ok((files, text)) - } - - /// Parses "post_image_block" element - /// Returns File - /// - /// Example element: - /// ```html - /// <div class="post_image_block" id="pib_77_2" pib="77_2" title="537.4 Кб, 946 x 946 - /// image.png - /// 17699092523481.png"> - /// <a class="expand_image" onclick="expand_local('77_2','/storage/a/cc/acc7f5856bc60ad3bdbd4dc7027e33f9.png','946','946',event); return false;" href="#"> - /// <div class="post_image" id="thumb_77_2"> - /// <img src="/storage/t/acc7f5856bc60ad3bdbd4dc7027e33f9.png" alt="" loading="lazy"> // thumbnail path - /// </div> - /// </a> - /// <a href="/storage/a/cc/acc7f5856bc60ad3bdbd4dc7027e33f9.png" target="_blank" class="img_filename">image.png</a> // can also be 
https://i.arhivach.vc/... if it's a video - /// </div> - /// ``` - fn parse_post_image_block(pib: scraper::ElementRef) -> File { - static SEL_POST_IMAGE_IMG: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new( - || scraper::Selector::parse(".post_image img").unwrap() - ); - static SEL_A_IMG_FILENAME: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new( - || scraper::Selector::parse("a.img_filename").unwrap() - ); - - // Title example: - // 402.2 Кб, 800 x 532 - // image.png <- name_orig - // 17699142349880.png <- name_timestamp - let title = pib.value().attr("title").unwrap_or(""); - let title_lines: Vec<&str> = title.lines().collect(); - let name_orig = title_lines - .get(1) - .map(|s| s.to_string()) - .unwrap_or("unnamed".to_string()); - let name_timestamp = title_lines - .get(2) - .map(|s| s.to_string()) - .unwrap_or("unnamed".to_string()); - - // url_thumb - let url_thumb = pib - .select(&SEL_POST_IMAGE_IMG) - .next() - .and_then(|el| el.value().attr("src")) - .unwrap_or(""); // /storage/t/83c2fe5ba9a8469d9eeef4af124e3b52.thumb - let url_thumb = if url_thumb.is_empty() { - String::new() - } else { - format!("{BASE_URL}{url_thumb}") - }; - - // url - let url = pib - .select(&SEL_A_IMG_FILENAME) - .next() - .and_then(|el| el.value().attr("href")) - .unwrap_or(""); - let url = if url.starts_with("http") { // is `https://i.arhivach.vc/...`? - url.to_string() - } else if url.is_empty() { - String::new() - } else { - format!("{BASE_URL}{url}") - }; - - File { - name_orig, - name_timestamp, - url_thumb, - url, - } - } - - /// Parses the post text from `div.post_comment_body` - /// - /// Returns post text: - /// - References are plaintext (e.g. >>329274789) - /// - `<br>` is replaced with \n - /// - `<span class="unkfunc">` (greentext) is replaced with >text - /// - /// If the text contains a reference (e.g. 
>>329274789) it looks like this in the element: - /// ```html - /// <div class="post_comment_body"> - /// <a href="#329274893" class="post-reply-link" data-thread="329273515" data-num="329274893">&gt;&gt;329274893</a> // This will be replaced with >>329274893 - /// <br> - /// <span class="unkfunc">&gt;greentext1</span> - /// <br> - /// text1 - /// </div> - /// ``` - /// - /// This example returns: - /// ```text - /// >>329274893 - /// >greentext1 - /// text1 - /// ``` - fn parse_post_comment_body(node: scraper::ElementRef) -> String { - use scraper::node::Node; - - let mut result = String::new(); - for child in node.children() { - match child.value() { - Node::Text(text) => result.push_str(&text.text), - Node::Element(el) if el.name() == "br" => result.push('\n'), - Node::Element(_) => { - if let Some(el_ref) = scraper::ElementRef::wrap(child) { - result.push_str(&el_ref.text().collect::<String>()); - } - } - _ => {} - } - } - result.trim().to_string() - } -} - -impl std::fmt::Display for Post { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - // Header line - let name = self.name.as_deref().unwrap_or("Аноним"); - let mailto = self.mailto.as_deref().unwrap_or(""); - - if !mailto.is_empty() { - write!(f, "{} ({})", name, mailto)?; - } else { - write!(f, "{}", name)?; - } - - write!(f, " {} {} ID:{}", self.time, self.num, self.id)?; - - // Subject - if let Some(ref subject) = self.subject { - write!(f, "\n{}", subject)?; - } - - // Files - if !self.files.is_empty() { - write!(f, "\n[Files: {}]", self.files.len())?; - for file in &self.files { - write!(f, "\n - {}", file)?; - } - } - - // Post text - if !self.text.is_empty() { - write!(f, "\n{}", self.text)?; - } - - Ok(()) - } -}