commit 766cc139ec89a1ac4d6c4da3bbbb5398fe493856
parent 6815f58962d8337ca8590c4af9218ded1ff11752
Author: egor-achkasov <eaachkasov@gmail.com>
Date: Mon, 9 Mar 2026 01:02:45 +0000
Complete refactor to separate front and back; encapsulate export
Diffstat:
21 files changed, 833 insertions(+), 911 deletions(-)
diff --git a/Cargo.toml b/Cargo.toml
@@ -4,6 +4,9 @@ description = "Download threads from arhivach."
version = "0.1.0"
edition = "2024"
+[lib]
+path = "src/lib/lib.rs"
+
[[bin]]
name = "arhivach-downloader-cli"
path = "src/bin/cli/main.rs"
diff --git a/README.md b/README.md
@@ -9,28 +9,28 @@ Download threads from arhivach.vc and save them locally for offline access or pr
`arhivach-downloader --help`:
```
-Download threads from arhivach.
-
-Usage: arhivarch-downloader.exe [OPTIONS] [URL]
+Usage: arhivach-downloader-cli.exe [OPTIONS] <URL>
Arguments:
- [URL] URL to download
+ <URL> URL to download
Options:
- -l, --list <LIST> Path to a text file containing a list of URLs (one per line)
- -t, --thumb Download thumbnail images, default: false
- -f, --files Download files (images, videos, gifs, etc), default: false
- -r, --resume Resume files and thumbnails downloading instead of overwriting. Useless if neither -t nor -f are set, default: false
- -h, --help Print help
+ -d, --dir <DIR> Path to download directory [default: .]
+ -e, --exporter <EXPORTER> Exporter [default: html] [possible values: html]
+ -t, --thumb Download thumbnail images, default: false
+ -f, --files Download files (images, videos, gifs, etc), default: false
+ -r, --resume Resume files and thumbnails downloading instead of overwriting. Useless if neither -t nor -f are set, default: false
+ -R, --retries <RETRIES> Download retries in case of an error [default: 3]
+ -h, --help Print help
```
-Each thread will be downloaded in a directory named by an OP №. Contents:
-- index.html -- the thread. Open it with your web browser.
-- files directory (if -f (--files) argument is given) -- all the files original attached to posts. Might be heavy if there are many videos.
-- thumb directory (if -t (--thumb) argument is given) -- all the thumbnails needed to render file previews in the thread.
+Creates a subdirectory named after the arhivach thread id (the number after `/thread/` in the URL) inside the download directory, and saves the thread there. Contents:
+- `index.html` — the thread. Open it with your web browser.
+- `files/` (if `-f`/`--files` is given) — original files attached to posts. May be large if there are many videos.
+- `thumb/` (if `-t`/`--thumb` is given) — thumbnails needed to render file previews in the thread.
-Main index.html will be created in the current directory to feature the first posts of the downloaded threads.
+Use `-r`/`--resume` to skip files and thumbnails that are already downloaded.
-Note that you may pass an URL directly as an argument, pass a path to a text file with URLs via -f, or both.
+Use `-d`/`--dir` to specify where to create the thread directory (defaults to the current directory).
-Use -r (--resume) to skip downloading files and thumbnails that are already there.
+Use `-R`/`--retries` to control how many times a failed download is retried (default: 3).
diff --git a/src/backend.rs b/src/backend.rs
@@ -1,59 +0,0 @@
-use anyhow::{Context, Ok, Result};
-use std::result::Result::Ok as StdOk;
-
-use crate::{config::Config, events::{Event, Reporter}, export::{Export, html}, http, post::Post};
-
-pub fn scrape_thread(url: &str, config: &Config, reporter: &dyn Reporter, exporter: &dyn Export) -> Result<Post> {
- let t_total = std::time::Instant::now();
-
- reporter.report(Event::FetchStarted { url: url.to_string() });
- let t = std::time::Instant::now();
- let html_content = http::fetch_with_retry(url, 3, reporter)?;
- reporter.report(Event::FetchDone { elapsed_ms: t.elapsed().as_millis() });
-
- reporter.report(Event::ParseStarted);
- let t = std::time::Instant::now();
- let posts = Post::parse_posts(&html_content).context("failed to parse thread HTML")?;
- reporter.report(Event::ParseDone {
- post_count: posts.len(),
- elapsed_ms: t.elapsed().as_millis(),
- });
-
- let first_post = posts.first().context("thread has no posts")?.clone();
-
- exporter.export(&posts, config, reporter).context("failed to export thread")?;
-
- reporter.report(Event::ThreadDone {
- url: url.to_string(),
- elapsed_ms: t_total.elapsed().as_millis(),
- });
-
- Ok(first_post)
-}
-
-pub fn run(config: &Config, reporter: &dyn Reporter, exporter: &dyn Export) -> Result<()> {
- let total = config.urls.len();
- let mut first_posts: Vec<Post> = Vec::new();
-
- for (i, url) in config.urls.iter().enumerate() {
- reporter.report(Event::ThreadStarted {
- url: url.clone(),
- index: i + 1,
- total,
- });
-
- match scrape_thread(url, config, reporter, exporter) {
- StdOk(first_post) => first_posts.push(first_post),
- Err(e) => {
- reporter.report(Event::ThreadFailed {
- url: url.clone(),
- error: format!("{:#}", e),
- });
- }
- }
- }
-
- html::write_index_html(&first_posts, config).context("failed to write main index.html")?;
-
- Ok(())
-}
diff --git a/src/bin/cli/main.rs b/src/bin/cli/main.rs
@@ -1,41 +1,47 @@
-use arhivarch_downloader::{backend, events::Event, config::Config, HtmlExporter};
+use arhivarch_downloader::config::Config;
+use arhivarch_downloader::event::Event;
+use arhivarch_downloader::export::{html::HtmlExporter, ExporterKind};
-use clap::Parser;
-use anyhow::Result;
+use clap::{Parser, ValueEnum};
use std::path::PathBuf;
-use std::sync::mpsc;
-fn main() -> anyhow::Result<()> {
- let config = parse_args().unwrap_or_else(|e| {
- eprintln!("Error: {}", e);
- std::process::exit(1);
- });
-
- let (tx, rx) = mpsc::channel::<Event>();
+#[derive(Clone, ValueEnum)]
+enum ExporterArg {
+ Html,
+}
+use std::sync::mpsc::channel;
+fn main() -> anyhow::Result<()> {
+ let config = parse_args();
+ let (tx, rx) = channel::<Event>();
let handle = std::thread::spawn({
let config = config.clone();
- move || backend::run(&config, &tx, &HtmlExporter)
+ move || arhivarch_downloader::run(&config, tx)
});
for event in rx {
render_event(&event);
}
- handle.join().unwrap()
+ let _ = handle.join().map_err(|e| anyhow::anyhow!("{:?}", e))?;
+ Ok(())
}
-pub fn parse_args() -> Result<Config> {
+pub fn parse_args() -> Config {
#[derive(Parser)]
#[command(about, long_about)]
struct Cli {
/// URL to download
- url: Option<String>,
+ url: String,
+
+ /// Path to download directory
+ #[arg(short = 'd', long = "dir", value_name = "DIR", default_value = ".", value_hint = clap::ValueHint::DirPath)]
+ dir: PathBuf,
- /// Path to a text file containing a list of URLs (one per line)
- #[arg(short = 'l', long = "list")]
- list: Option<PathBuf>,
+ /// Exporter
+ #[arg(short = 'e', long = "exporter", value_name = "EXPORTER", default_value = "html")]
+ exporter: ExporterArg,
/// Download thumbnail images, default: false
#[arg(short = 't', long = "thumb", default_value_t = false)]
@@ -47,85 +53,78 @@ pub fn parse_args() -> Result<Config> {
/// Resume files and thumbnails downloading instead of overwriting. Useless if neither -t nor -f are set, default: false
#[arg(short = 'r', long = "resume", default_value_t = false)]
- resume: bool
- }
- let cli = Cli::parse();
+ resume: bool,
- let mut urls = Vec::new();
- // [URL]
- if let Some(url) = cli.url {
- urls.push(url);
- }
- // [List]
- if let Some(list) = cli.list {
- for line in std::fs::read_to_string(list)?.lines() {
- urls.push(line.to_string());
- }
- }
- if urls.is_empty() {
- anyhow::bail!("No URLs provided");
+ /// Download retries in case of an error
+ #[arg(short = 'R', long = "retries", default_value_t = 3)]
+ download_retries: u32,
}
+ let cli = Cli::parse();
- Ok(Config {
- urls,
+ Config {
+ url: cli.url,
+ dir: cli.dir,
+ exporter: match cli.exporter {
+ ExporterArg::Html => ExporterKind::Html(HtmlExporter),
+ },
thumb: cli.thumb,
files: cli.files,
resume: cli.resume,
- })
+ download_retries: cli.download_retries,
+ }
}
fn render_event(event: &Event) {
use std::io::Write;
match event {
- Event::ThreadStarted { url, index, total } =>
- println!("Processing {} ({} / {}):", url, index, total),
-
- Event::ThreadDone { url, elapsed_ms } =>
- println!("Done processing {} ({} ms)", url, elapsed_ms),
-
- Event::ThreadFailed { url, error } =>
- eprintln!("Error processing {}: {}", url, error),
-
- Event::FetchStarted { .. } => {
- print!("\tGetting thread...");
+ Event::GetStarted => {
+ print!("Fetching thread...");
std::io::stdout().flush().ok();
}
-
- Event::FetchDone { elapsed_ms } =>
- println!(" Done ({} ms)", elapsed_ms),
-
- Event::FetchRetrying { url, attempt, max_attempts, error } => {
- eprintln!("\n\tHTTP request failed for {}: {}", url, error);
- if attempt < max_attempts {
- eprintln!("\tWaiting 3 seconds...");
- }
+ Event::GetDone =>
+ println!(" Done."),
+ Event::GetFailed { error } =>
+ eprintln!("\nFailed to fetch thread: {}", error),
+
+ Event::DownloadAllStarted =>
+ println!("Downloading stuff..."),
+ Event::DownloadAllDone =>
+ println!("All downloads complete."),
+ Event::DownloadAllFailed { error } =>
+ eprintln!("Download failed: {}", error),
+
+ Event::DownloadStarted { index, max_index } => {
+ print!("\r\tDownloading {} / {}...", index, max_index);
+ std::io::stdout().flush().ok();
}
-
- Event::ParseStarted => {
- print!("\tParsing posts...");
+ Event::DownloadDone { index, max_index } => {
+ println!("\r\tDownloading {} / {}... Done.", index, max_index);
+ }
+ Event::DownloadFailed { url, error } =>
+ eprintln!("\r\tFailed to download {}: {}", url, error),
+ Event::DownloadSkipped { index, max_index } =>
+ println!("\r\tDownloading {} / {}... Skipped.", index, max_index),
+
+ Event::DownloadFilesStarted => {
+ println!("Downloading files...");
std::io::stdout().flush().ok();
}
-
- Event::ParseDone { elapsed_ms, .. } =>
- println!(" Done ({} ms)", elapsed_ms),
-
- Event::DownloadBatchStarted { label, total_posts } => {
- print!("\tDownloading {}... post 0 / {}", label, total_posts);
+ Event::DownloadFilesDone =>
+ println!("Done."),
+ Event::DownloadThumbStarted => {
+ println!("Downloading thumbnails...");
std::io::stdout().flush().ok();
}
+ Event::DownloadThumbDone =>
+ println!("Done."),
- Event::DownloadBatchProgress { label, done, total } => {
- print!("\r\tDownloading {}... post {} / {}", label, done, total);
+ Event::ExportStarted => {
+ print!("Exporting...");
std::io::stdout().flush().ok();
}
-
- Event::DownloadAssetFailed { label, filename, error, .. } =>
- println!("\r\tFailed to download {} {}: {}\n\t-> Waiting 3 seconds...", label, filename, error),
-
- Event::DownloadAssetSkipped { label, filename } =>
- println!("\tSkipping {} {} after 3 failed attempts.", label, filename),
-
- Event::DownloadBatchDone { elapsed_ms, .. } =>
- println!(" Done ({} ms)", elapsed_ms),
+ Event::ExportDone =>
+ println!(" Done."),
+ Event::ExportFailed { error } =>
+ eprintln!("\nExport failed: {}", error),
}
}
diff --git a/src/config.rs b/src/config.rs
@@ -1,7 +0,0 @@
-#[derive(Debug, Clone)]
-pub struct Config {
- pub urls: Vec<String>,
- pub thumb: bool,
- pub files: bool,
- pub resume: bool,
-}
diff --git a/src/events.rs b/src/events.rs
@@ -1,45 +0,0 @@
-#[derive(Debug, Clone)]
-pub enum Event {
- // Thread-level lifecycle
- ThreadStarted { url: String, index: usize, total: usize },
- ThreadDone { url: String, elapsed_ms: u128 },
- ThreadFailed { url: String, error: String },
-
- // HTTP fetch
- FetchStarted { url: String },
- FetchDone { elapsed_ms: u128 },
- FetchRetrying { url: String, attempt: u32, max_attempts: u32, error: String },
-
- // HTML parsing
- ParseStarted,
- ParseDone { post_count: usize, elapsed_ms: u128 },
-
- // Asset downloading
- DownloadBatchStarted { label: String, total_posts: usize },
- DownloadBatchProgress { label: String, done: usize, total: usize },
- DownloadAssetFailed { label: String, filename: String, attempt: u32, error: String },
- DownloadAssetSkipped { label: String, filename: String },
- DownloadBatchDone { label: String, elapsed_ms: u128 },
-}
-
-use std::sync::mpsc;
-
-/// Sink for progress events emitted by the library.
-/// Implement this to connect the library to any frontend.
-pub trait Reporter: Send + Sync {
- fn report(&self, event: Event);
-}
-
-/// Blanket impl: mpsc::Sender<Event> is already a valid Reporter.
-impl Reporter for mpsc::Sender<Event> {
- fn report(&self, event: Event) {
- self.send(event).ok();
- }
-}
-
-/// No-op reporter — useful in tests or when progress output is not needed.
-pub struct NullReporter;
-
-impl Reporter for NullReporter {
- fn report(&self, _event: Event) {}
-}
diff --git a/src/export/html/mod.rs b/src/export/html/mod.rs
@@ -1,142 +0,0 @@
-use crate::{config::Config, events::{Event, Reporter}, http, post::{File, Post}};
-use anyhow::{Result, Context};
-use super::Export;
-
-mod render;
-
-const TEMPLATE: &str = include_str!("template.html");
-
-pub struct HtmlExporter;
-
-impl Export for HtmlExporter {
- fn export(&self, posts: &[Post], config: &Config, reporter: &dyn Reporter) -> Result<()> {
- if posts.is_empty() {
- anyhow::bail!("No posts to export");
- }
-
- let dir = format!("{}", posts[0].id);
- std::fs::create_dir_all(&dir)?;
-
- let posts_html: String = posts
- .iter()
- .map(|p| render::render_post(p, config.files, config.thumb))
- .collect::<Vec<String>>()
- .join("\n");
-
- if config.files {
- download_assets(
- &posts,
- &format!("{}/files", dir),
- "files",
- |f| &f.url,
- config.resume,
- reporter,
- )?;
- }
- if config.thumb {
- download_assets(
- &posts,
- &format!("{}/thumb", dir),
- "thumbnails",
- |f| &f.url_thumb,
- config.resume,
- reporter,
- )?;
- }
-
- let index_html = TEMPLATE.replace("{{posts}}", &posts_html);
- std::fs::write(format!("{}/index.html", dir), index_html)?;
-
- Ok(())
- }
-}
-
-/// Write a top-level index.html with one entry per thread (first post + link to thread folder)
-pub fn write_index_html(first_posts: &[Post], config: &Config) -> Result<()> {
- if first_posts.is_empty() {
- return Ok(());
- }
-
- let posts_html: String = first_posts
- .iter()
- .map(|p| {
- let mut post_html = render::render_post(p, config.files, config.thumb);
- config.files.then(|| post_html = post_html.replace(
- "<a href=\"files/",
- &format!("<a href=\"{}/files/", p.id),
- ));
- config.thumb.then(|| post_html = post_html.replace(
- "<img src=\"thumb/",
- &format!("<img src=\"{}/thumb/", p.id),
- ));
- format!("<div><a href=\"{}/index.html\">В тред →</a></div>{}\n", p.id, post_html)
- })
- .collect::<Vec<String>>()
- .join("\n");
-
- let index_html = TEMPLATE.replace("{{posts}}", &posts_html);
- std::fs::write("index.html", index_html)
- .context("failed to write index.html")?;
-
- Ok(())
-}
-
-fn download_assets(
- posts: &[Post],
- dest_dir: &str,
- label: &str,
- url_of: impl Fn(&File) -> &str,
- skip_if_exists: bool,
- reporter: &dyn Reporter,
-) -> Result<()> {
- std::fs::create_dir_all(dest_dir)
- .with_context(|| format!("Failed to create directory {}", dest_dir))?;
-
- let t = std::time::Instant::now();
- reporter.report(Event::DownloadBatchStarted {
- label: label.to_string(),
- total_posts: posts.len(),
- });
-
- for (i, post) in posts.iter().enumerate() {
- for f in &post.files {
- let url = url_of(f);
- let filename = url.split('/').last().unwrap_or("").to_string();
- let path = format!("{}/{}", dest_dir, filename);
- if skip_if_exists && std::path::Path::new(&path).exists() {
- continue;
- }
- let mut result = Err(anyhow::anyhow!("no attempts"));
- for attempt in 1..=3u32 {
- result = http::download(url, &path);
- if result.is_ok() { break; }
- let e = result.as_ref().unwrap_err();
- reporter.report(Event::DownloadAssetFailed {
- label: label.to_string(),
- filename: filename.clone(),
- attempt,
- error: e.to_string(),
- });
- std::thread::sleep(std::time::Duration::from_secs(3));
- }
- if result.is_err() {
- reporter.report(Event::DownloadAssetSkipped {
- label: label.to_string(),
- filename: filename.clone(),
- });
- }
- }
- reporter.report(Event::DownloadBatchProgress {
- label: label.to_string(),
- done: i + 1,
- total: posts.len(),
- });
- }
-
- reporter.report(Event::DownloadBatchDone {
- label: label.to_string(),
- elapsed_ms: t.elapsed().as_millis(),
- });
-
- Ok(())
-}
diff --git a/src/export/html/render.rs b/src/export/html/render.rs
@@ -1,139 +0,0 @@
-use crate::thread::{File, Post};
-
-fn html_escape(s: &str) -> String {
- s.replace('&', "&amp;")
- .replace('<', "&lt;")
- .replace('>', "&gt;")
- .replace('"', "&quot;")
-}
-
-/// Converts plain post text to HTML.
-/// - `>>id` → reply link anchor
-/// - Lines starting with `>` (not `>>digit`) → greentext span
-/// - `\n` → `<br>`
-pub fn render_text_to_html(text: &str) -> String {
- let needle = ">>";
-
- let lines: Vec<String> = text.split('\n').map(|line| {
- let escaped = html_escape(line);
-
- // Replace >>id with reply link anchors
- let mut processed = String::with_capacity(escaped.len());
- let mut rest = escaped.as_str();
- while let Some(pos) = rest.find(needle) {
- processed.push_str(&rest[..pos]);
- let after = &rest[pos + needle.len()..];
- let digit_end = after.find(|c: char| !c.is_ascii_digit()).unwrap_or(after.len());
- if digit_end > 0 {
- let id = &after[..digit_end];
- processed.push_str(&format!("<a href=\"#post{id}\" class=\"reply-link\">&gt;&gt;{id}</a>"));
- rest = &after[digit_end..];
- } else {
- processed.push_str(needle);
- rest = after;
- }
- }
- processed.push_str(rest);
-
- // Wrap in greentext span if line starts with > but not >>digit
- let is_greentext = escaped.starts_with("&gt;")
- && !escaped.strip_prefix(needle).is_some_and(|s| s.starts_with(|c: char| c.is_ascii_digit()));
- if is_greentext {
- format!("<span class=\"quote\">{processed}</span>")
- } else {
- processed
- }
- }).collect();
-
- lines.join("<br>\n")
-}
-
-/// Renders a single post to an HTML fragment string.
-pub fn render_post(post: &Post, download_files: bool, download_thumbnails: bool) -> String {
- let mut html = format!("<div class=\"post\" id=\"post{}\">\n", post.id);
-
- html.push_str(" <div class=\"post-head\">\n");
-
- // Subject
- if let Some(ref subject) = post.subject {
- html.push_str(&format!(
- " <span class=\"post-subject\">{}</span>\n",
- html_escape(subject)
- ));
- }
-
- // Name /w mailto/sage
- let name = post.name.as_deref().unwrap_or("Аноним");
- let name_display = if let Some(ref mailto) = post.mailto {
- format!("[{}] {}", mailto, name)
- } else {
- name.to_string()
- };
- html.push_str(&format!(
- " <span class=\"post-name\">{}</span>\n",
- html_escape(&name_display)
- ));
-
- // Time, num, id
- html.push_str(&format!(" <span class=\"post-time\">{}</span>\n", html_escape(&post.time)));
- html.push_str(&format!(" <span class=\"post-num\">{}</span>\n", html_escape(&post.num)));
- html.push_str(&format!(
- " <span class=\"post-id\"><a href=\"#post{0}\">№{0}</a></span>\n",
- post.id
- ));
-
- html.push_str(" </div>\n");
-
- // Images
- html.push_str(&render_images(&post.files, download_files, download_thumbnails));
-
- // Body
- html.push_str(" <div class=\"post-body\">\n");
- if !post.text.is_empty() {
- html.push_str(" ");
- html.push_str(&render_text_to_html(&post.text));
- html.push('\n');
- }
- html.push_str(" </div>\n");
-
- html.push_str("</div>\n");
- html
-}
-
-fn render_images(
- files: &[File],
- download_files: bool,
- download_thumbnails: bool,
-) -> String {
- if files.is_empty() {
- return String::new();
- }
-
- let mut html = String::from(" <div class=\"post-images\">\n");
- for file in files {
- let href = if download_files && !file.url.is_empty() {
- format!("files/{}", file.url.split('/').last().unwrap_or(""))
- } else {
- file.url.clone()
- };
-
- let thumb_filename = file.url_thumb.split('/').last().unwrap_or("").to_string();
- let img_src = if download_thumbnails && !file.url_thumb.is_empty() {
- format!("thumb/{}", thumb_filename)
- } else {
- file.url_thumb.clone()
- };
-
- html.push_str(&format!(
- " <div class=\"post-image\">\n <a href=\"{}\" target=\"_blank\" title=\"{}\">\n <img src=\"{}\" alt=\"\" loading=\"lazy\">\n </a>\n <div class=\"post-image-info\">{} (<a href=\"{}\" target=\"_blank\" class=\"post-image-link\">o</a>, <a href=\"{}\" target=\"_blank\" class=\"post-image-link\">t</a>)</div>\n </div>\n",
- html_escape(&href),
- html_escape(&file.name_orig),
- html_escape(&img_src),
- html_escape(&file.name_orig),
- html_escape(&file.url),
- html_escape(&file.url_thumb),
- ));
- }
- html.push_str(" </div>\n");
- html
-}
diff --git a/src/export/mod.rs b/src/export/mod.rs
@@ -1,8 +0,0 @@
-use crate::{config::Config, events::Reporter, post::Post};
-use anyhow::Result;
-
-pub mod html;
-
-pub trait Export {
- fn export(&self, posts: &[Post], config: &Config, reporter: &dyn Reporter) -> Result<()>;
-}
diff --git a/src/http.rs b/src/http.rs
@@ -1,35 +0,0 @@
-use anyhow::{Context, Result};
-
-use crate::events::{Event, Reporter};
-
-/// GET a URL with up to `attempts` retries, reporting each failure via `reporter`.
-pub fn fetch_with_retry(url: &str, attempts: u32, reporter: &dyn Reporter) -> Result<String> {
- for attempt in 1..=attempts {
- match reqwest::blocking::get(url).and_then(|r| r.text()) {
- Ok(text) => return Ok(text),
- Err(e) => {
- reporter.report(Event::FetchRetrying {
- url: url.to_string(),
- attempt,
- max_attempts: attempts,
- error: e.to_string(),
- });
- if attempt < attempts {
- std::thread::sleep(std::time::Duration::from_secs(3));
- }
- }
- }
- }
- anyhow::bail!("failed to get thread after {attempts} attempts")
-}
-
-/// Download a single URL and write it to `path`.
-pub fn download(url: &str, path: &str) -> Result<()> {
- let bytes = reqwest::blocking::get(url)
- .with_context(|| format!("HTTP GET failed for {}", url))?
- .bytes()
- .context("failed to read response body")?;
- std::fs::write(path, &bytes)
- .with_context(|| format!("failed to write {}", path))?;
- Ok(())
-}
diff --git a/src/lib.rs b/src/lib.rs
@@ -1,9 +0,0 @@
-pub mod config;
-pub mod events;
-pub mod backend;
-pub mod post;
-pub mod http;
-pub mod export;
-
-pub use events::{Reporter, NullReporter};
-pub use export::html::HtmlExporter;
diff --git a/src/lib/config.rs b/src/lib/config.rs
@@ -0,0 +1,12 @@
+use crate::export::ExporterKind;
+
+#[derive(Clone)]
+pub struct Config {
+ pub url: String,
+ pub dir: std::path::PathBuf,
+ pub exporter: ExporterKind,
+ pub thumb: bool,
+ pub files: bool,
+ pub resume: bool,
+ pub download_retries: u32,
+}
diff --git a/src/lib/download.rs b/src/lib/download.rs
@@ -0,0 +1,25 @@
+use anyhow::{anyhow, Result};
+
+/// Downloads a URL, retrying up to `tries` times.
+///
+/// # Errors
+/// Returns an error if all attempts fail or `tries` is 0.
+pub fn download(url: &str, tries: u32) -> Result<reqwest::blocking::Response> {
+ static CLIENT: std::sync::LazyLock<reqwest::blocking::Client> =
+ std::sync::LazyLock::new(reqwest::blocking::Client::new);
+
+ for attempt in 0..tries {
+ if attempt > 0 {
+ std::thread::sleep(std::time::Duration::from_millis(500 * 2u64.pow(attempt)));
+ }
+ let response = CLIENT.get(url).send()?;
+ if response.status().is_success() {
+ return Ok(response);
+ }
+ if response.status().is_client_error() {
+ return Err(anyhow!("client error: {}", response.status()));
+ }
+ }
+
+ Err(anyhow!("failed to download {} after {} tries", url, tries))
+}
diff --git a/src/lib/event.rs b/src/lib/event.rs
@@ -0,0 +1,29 @@
+#[derive(Debug, Clone)]
+pub enum Event {
+ // Thread retrieval
+ GetStarted,
+ GetDone,
+ GetFailed { error: String },
+
+ // Files download
+ DownloadAllStarted,
+ DownloadAllDone,
+ DownloadAllFailed { error: String },
+
+ // File download
+ DownloadStarted { index: usize, max_index: usize },
+ DownloadDone { index: usize, max_index: usize },
+ DownloadSkipped { index: usize, max_index: usize },
+ DownloadFailed { url: String, error: String },
+
+ // Files and thumbnails download
+ DownloadFilesStarted,
+ DownloadFilesDone,
+ DownloadThumbStarted,
+ DownloadThumbDone,
+
+ // Thread export
+ ExportStarted,
+ ExportDone,
+ ExportFailed { error: String },
+}
diff --git a/src/lib/export/html/mod.rs b/src/lib/export/html/mod.rs
@@ -0,0 +1,27 @@
+use crate::{config::Config, post::Post};
+use anyhow::Result;
+use super::Exporter;
+
+mod render;
+
+const TEMPLATE: &str = include_str!("template.html");
+
+#[derive(Clone)]
+pub struct HtmlExporter;
+
+impl Exporter for HtmlExporter {
+ fn export(&self, posts: &[Post], config: &Config) -> Result<()> {
+ anyhow::ensure!(!posts.is_empty(), "No posts to export");
+
+ std::fs::create_dir_all(&config.dir)?;
+ let posts_html = posts
+ .iter()
+ .map(|p| render::render_post(p, config.files, config.thumb))
+ .collect::<Vec<String>>()
+ .join("\n");
+ let index_html = TEMPLATE.replace("{{posts}}", &posts_html);
+ std::fs::write(config.dir.join("index.html"), index_html)?;
+
+ Ok(())
+ }
+}
diff --git a/src/lib/export/html/render.rs b/src/lib/export/html/render.rs
@@ -0,0 +1,140 @@
+use crate::post::{File, Post};
+
+/// Renders a single post to an HTML fragment string.
+/// If download_files or download_thumbnails is true, the links will be converted to local paths
+pub fn render_post(post: &Post, download_files: bool, download_thumbnails: bool) -> String {
+ let mut html = format!("<div class=\"post\" id=\"post{}\">\n", post.id);
+
+ html.push_str(" <div class=\"post-head\">\n");
+
+ // Subject
+ if let Some(ref subject) = post.subject {
+ html.push_str(&format!(
+ " <span class=\"post-subject\">{}</span>\n",
+ html_escape(subject)
+ ));
+ }
+
+ // Name /w mailto/sage
+ let name = post.name.as_deref().unwrap_or("Аноним");
+ let name_display = if let Some(ref mailto) = post.mailto {
+ format!("[{}] {}", mailto, name)
+ } else {
+ name.to_string()
+ };
+ html.push_str(&format!(
+ " <span class=\"post-name\">{}</span>\n",
+ html_escape(&name_display)
+ ));
+
+ // Time, num, id
+ html.push_str(&format!(" <span class=\"post-time\">{}</span>\n", html_escape(&post.time)));
+ html.push_str(&format!(" <span class=\"post-num\">{}</span>\n", html_escape(&post.num)));
+ html.push_str(&format!(
+ " <span class=\"post-id\"><a href=\"#post{0}\">№{0}</a></span>\n",
+ post.id
+ ));
+
+ html.push_str(" </div>\n");
+
+ // Images
+ html.push_str(&render_images(&post.files, download_files, download_thumbnails));
+
+ // Body
+ html.push_str(" <div class=\"post-body\">\n");
+ if !post.text.is_empty() {
+ html.push_str(" ");
+ html.push_str(&render_text_to_html(&post.text));
+ html.push('\n');
+ }
+ html.push_str(" </div>\n");
+
+ html.push_str("</div>\n");
+ html
+}
+
+fn html_escape(s: &str) -> String {
+ s.replace('&', "&amp;")
+ .replace('<', "&lt;")
+ .replace('>', "&gt;")
+ .replace('"', "&quot;")
+}
+
+/// Converts plain post text to HTML.
+/// - `>>id` → reply link anchor
+/// - Lines starting with `>` (not `>>digit`) → greentext span
+/// - `\n` → `<br>`
+fn render_text_to_html(text: &str) -> String {
+ let needle = "&gt;&gt;";
+
+ let lines: Vec<String> = text.split('\n').map(|line| {
+ let escaped = html_escape(line);
+
+ // Replace >>id with reply link anchors
+ let mut processed = String::with_capacity(escaped.len());
+ let mut rest = escaped.as_str();
+ while let Some(pos) = rest.find(needle) {
+ processed.push_str(&rest[..pos]);
+ let after = &rest[pos + needle.len()..];
+ let digit_end = after.find(|c: char| !c.is_ascii_digit()).unwrap_or(after.len());
+ if digit_end > 0 {
+ let id = &after[..digit_end];
+ processed.push_str(&format!("<a href=\"#post{id}\" class=\"reply-link\">&gt;&gt;{id}</a>"));
+ rest = &after[digit_end..];
+ } else {
+ processed.push_str(needle);
+ rest = after;
+ }
+ }
+ processed.push_str(rest);
+
+ // Wrap in greentext span if line starts with > but not >>digit
+ let is_greentext = escaped.starts_with("&gt;")
+ && !escaped.strip_prefix(needle).is_some_and(|s| s.starts_with(|c: char| c.is_ascii_digit()));
+ if is_greentext {
+ format!("<span class=\"quote\">{processed}</span>")
+ } else {
+ processed
+ }
+ }).collect();
+
+ lines.join("<br>\n")
+}
+
+fn render_images(
+ files: &[File],
+ download_files: bool,
+ download_thumbnails: bool,
+) -> String {
+ if files.is_empty() {
+ return String::new();
+ }
+
+ let mut html = String::from(" <div class=\"post-images\">\n");
+ for file in files {
+ let href = if download_files && !file.url.is_empty() {
+ format!("files/{}", file.url.split('/').last().unwrap_or(""))
+ } else {
+ file.url.clone()
+ };
+
+ let thumb_filename = file.url_thumb.split('/').last().unwrap_or("").to_string();
+ let img_src = if download_thumbnails && !file.url_thumb.is_empty() {
+ format!("thumb/{}", thumb_filename)
+ } else {
+ file.url_thumb.clone()
+ };
+
+ html.push_str(&format!(
+ " <div class=\"post-image\">\n <a href=\"{}\" target=\"_blank\" title=\"{}\">\n <img src=\"{}\" alt=\"\" loading=\"lazy\">\n </a>\n <div class=\"post-image-info\">{} (<a href=\"{}\" target=\"_blank\" class=\"post-image-link\">o</a>, <a href=\"{}\" target=\"_blank\" class=\"post-image-link\">t</a>)</div>\n </div>\n",
+ html_escape(&href),
+ html_escape(&file.name_orig),
+ html_escape(&img_src),
+ html_escape(&file.name_orig),
+ html_escape(&file.url),
+ html_escape(&file.url_thumb),
+ ));
+ }
+ html.push_str(" </div>\n");
+ html
+}
diff --git a/src/export/html/template.html b/src/lib/export/html/template.html
diff --git a/src/lib/export/mod.rs b/src/lib/export/mod.rs
@@ -0,0 +1,35 @@
+pub mod html;
+
+use super::{config::Config, post::Post};
+
+use anyhow::Result;
+
+use std::str::FromStr;
+
+#[derive(Clone)]
+pub enum ExporterKind {
+ Html(html::HtmlExporter),
+}
+
+pub trait Exporter {
+ fn export(&self, posts: &[Post], config: &Config) -> Result<()>;
+}
+
+impl Exporter for ExporterKind {
+ fn export(&self, posts: &[Post], config: &Config) -> Result<()> {
+ match self {
+ ExporterKind::Html(html) => html.export(posts, config),
+ }
+ }
+}
+
+impl FromStr for ExporterKind {
+ type Err = anyhow::Error;
+
+ fn from_str(s: &str) -> Result<ExporterKind> {
+ match s.to_lowercase().as_str() {
+ "html" => Ok(ExporterKind::Html(html::HtmlExporter {})),
+ _ => anyhow::bail!("unknown exporter: {}", s),
+ }
+ }
+}
diff --git a/src/lib/lib.rs b/src/lib/lib.rs
@@ -0,0 +1,96 @@
+pub mod config;
+pub mod event;
+pub mod export;
+
+mod download;
+mod post;
+
+use crate::post::{Post, File};
+use crate::export::Exporter;
+
+use anyhow::{Result, Context};
+
+use std::sync::mpsc::Sender;
+
+pub const BASE_URL: &str = "https://arhivach.vc";
+
+pub fn run(config: &config::Config, tx: Sender<event::Event>) -> Result<()> {
+ tx.send(event::Event::GetStarted)?;
+ let html = download::download(&config.url, config.download_retries)?.text()?;
+ let posts = Post::parse_posts(&html)
+ .inspect_err(|e| { let _ = tx.send(event::Event::GetFailed { error: format!("{:#}", e) }); })
+ .context("failed to parse posts")?;
+ tx.send(event::Event::GetDone)?;
+
+ tx.send(event::Event::DownloadAllStarted)?;
+ run_download(&posts, &config, tx.clone())
+ .inspect_err(|e| { let _ = tx.send(event::Event::DownloadAllFailed { error: format!("{:#}", e) }); })
+ .context("failed to download files")?;
+ tx.send(event::Event::DownloadAllDone)?;
+
+ tx.send(event::Event::ExportStarted)?;
+ config.exporter.export(&posts, config)
+ .inspect_err(|e| { let _ = tx.send(event::Event::ExportFailed { error: format!("{:#}", e) }); })
+ .context("failed to export")?;
+ tx.send(event::Event::ExportDone)?;
+
+ Ok(())
+}
+
+/// Download files and thumbnails. Send DownloadStarted, DownloadDone and DownloadFailed events
+fn run_download(posts: &[Post], config: &config::Config, tx: Sender<event::Event>) -> Result<()> {
+ std::fs::create_dir_all(&config.dir)?;
+
+ let download_item = |url: &str, filepath: &std::path::PathBuf| -> Result<()> {
+ let result = download::download(url, config.download_retries)?;
+ anyhow::ensure!(result.status().is_success(), "failed to download {}: {}", url, result.status());
+ let bytes = result.bytes()?;
+ anyhow::ensure!(!bytes.is_empty(), "empty file: {}", url);
+ std::fs::write(filepath, bytes)?;
+ Ok(())
+ };
+
+ let download_section = |
+ subdir: &str,
+ get_url: fn(&File) -> (&str, &str),
+ | -> Result<()> {
+ let dir = config.dir.join(subdir);
+ std::fs::create_dir_all(&dir)?;
+
+ let mut index: usize = 1;
+ let max_index: usize = posts.iter().map(|p| p.files.len()).sum();
+ for f in posts.iter().flat_map(|p| &p.files) {
+ tx.send(event::Event::DownloadStarted { index, max_index })?;
+ let (url, fallback) = get_url(f);
+ let filename = url.rsplit("/").next().unwrap_or(fallback).trim();
+ let filepath = dir.join(filename);
+ if config.resume && filepath.exists() {
+ tx.send(event::Event::DownloadSkipped { index, max_index })?;
+ index += 1;
+ continue
+ }
+ match download_item(url, &filepath) {
+ Ok(()) => tx.send(event::Event::DownloadDone{ index, max_index })?,
+ Err(e) => tx.send(event::Event::DownloadFailed {
+ url: url.to_string(),
+ error: format!("{:#}", e)
+ })?
+ };
+ index += 1;
+ }
+ Ok(())
+ };
+
+ if config.files {
+ tx.send(event::Event::DownloadFilesStarted)?;
+ download_section("files", |f| (&f.url, &f.name_timestamp))?;
+ tx.send(event::Event::DownloadFilesDone)?;
+ }
+ if config.thumb {
+ tx.send(event::Event::DownloadThumbStarted)?;
+ download_section("thumb", |f| (&f.url_thumb, &f.name_timestamp))?;
+ tx.send(event::Event::DownloadThumbDone)?;
+ }
+
+ Ok(())
+}
diff --git a/src/lib/post.rs b/src/lib/post.rs
@@ -0,0 +1,374 @@
+use super::BASE_URL;
+
+use anyhow::{Context, Result};
+
+#[derive(Debug, Clone)]
+pub struct File {
+ /// original name, "videolol.mp4"
+ pub name_orig: String,
+    /// timestamp name, "17699100670710.mp4"
+ pub name_timestamp: String,
+ /// thumbnail url, "https://arhivach.vc/storage/t/aeaa7825f8d8ffe3f07f242a59b7761c.thumb"
+ pub url_thumb: String,
+ /// url, "https://i.arhivach.vc/storage/a/ea/aeaa7825f8d8ffe3f07f242a59b7761c.mp4"
+ pub url: String,
+}
+
+impl std::fmt::Display for File {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ write!(
+ f,
+ "{} [{}]\n url: {}\n thumb: {}",
+ self.name_orig, self.name_timestamp, self.url, self.url_thumb
+ )
+ }
+}
+
+struct PostHead {
+ subject: Option<String>,
+ name: Option<String>,
+ mailto: Option<String>,
+ time: String,
+ num: String,
+ id: u32,
+}
+
+/// Represents a single post in a thread
+#[derive(Debug, Clone)]
+pub struct Post {
+ /// Empty if None
+ pub subject: Option<String>,
+ /// "Аноним" if none
+ pub name: Option<String>,
+ /// "mailto:sage"
+ pub mailto: Option<String>,
+ /// "01/02/26 Вск 03:13:12"
+ pub time: String,
+ /// "#5"
+ pub num: String,
+ /// "329281515"
+ pub id: u32,
+ pub files: Vec<File>,
+ /// Post text
+ pub text: String,
+}
+
+impl Post {
+ pub fn parse_posts(
+ html: &str,
+ ) -> Result<Vec<Post>> {
+ let mut posts = Vec::new();
+
+ let document = scraper::Html::parse_document(html);
+ let selector = scraper::Selector::parse(r#"div.post"#).unwrap();
+ for node in document.select(&selector) {
+ let post = Post::parse_post(node)?;
+ posts.push(post);
+ }
+
+ Ok(posts)
+ }
+
+ /// Parse div class="post"
+ ///
+ /// Example element:
+ /// ```html
+ /// <div class="post" id="post329274763" postid="329274763">
+ /// <div class="post_head">...</div> (see parse_post_head function)
+ /// <span class="post_comment">...</span> (see parse_post_comment function)
+ /// </div>
+ /// ```
+ fn parse_post(node: scraper::ElementRef) -> Result<Post> {
+ static SEL_POST_HEAD: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new(
+ || scraper::Selector::parse("div.post_head").unwrap()
+ );
+ static SEL_POST_IMAGE_BLOCK: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new(
+ || scraper::Selector::parse("span.post_comment").unwrap()
+ );
+
+ let post_head = node
+ .select(&SEL_POST_HEAD)
+ .next()
+ .context("missing post_head")?;
+ let head = Post::parse_post_head(post_head)?;
+
+ let post_comment = node
+ .select(&SEL_POST_IMAGE_BLOCK)
+ .next()
+ .context("missing post_comment")?;
+ let (files, text) = Post::parse_post_comment(post_comment)?;
+
+ Ok(Post {
+ subject: head.subject,
+ name: head.name,
+ mailto: head.mailto,
+ time: head.time,
+ num: head.num,
+ id: head.id,
+ files,
+ text,
+ })
+ }
+
+ /// Parses the post_head element
+ ///
+    /// Returns a PostHead holding subject, name, mailto, time, num and id
+ /// Returns error if no time, num or id is found or if id is not a number
+ ///
+ /// Example element:
+ /// ```html
+ /// <div class="post_head">
+ /// <span class="poster_name" title="">Аноним</span>
+ /// <span class="post_time">01/02/26 Вск 04:27:32</span>
+ /// <span class="post_num">#77</span>
+ /// <span class="post_id">
+ /// <a style="position:absolute;margin-top:-50px;" id="329274763"></a>
+ /// <a href="#329274763">№329274763</a>
+ /// </span>
+ /// </div>
+ /// ```
+ fn parse_post_head(post_head: scraper::ElementRef) -> Result<PostHead> {
+ static SEL_SPAN_POST_ID_A_HREF: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new(
+ || scraper::Selector::parse("span.post_id a[href]").unwrap()
+ );
+ static SEL_H1_POST_SUBJECT: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new(
+ || scraper::Selector::parse("h1.post_subject").unwrap()
+ );
+ static SEL_SPAN_POSTER_NAME: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new(
+ || scraper::Selector::parse("span.poster_name").unwrap()
+ );
+ static SEL_A_POST_MAIL: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new(
+ || scraper::Selector::parse("a.post_mail").unwrap()
+ );
+ static SEL_SPAN_POST_TIME: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new(
+ || scraper::Selector::parse("span.post_time").unwrap()
+ );
+ static SEL_SPAN_POST_NUM: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new(
+ || scraper::Selector::parse("span.post_num").unwrap()
+ );
+
+ let id: u32 = post_head
+ .select(&SEL_SPAN_POST_ID_A_HREF)
+ .next()
+ .and_then(|el| el.value().attr("href"))
+ .and_then(|href| href.strip_prefix('#'))
+ .context("missing post id")?
+ .parse()?;
+
+ let subject = post_head
+ .select(&SEL_H1_POST_SUBJECT)
+ .next()
+ .map(|el| el.text().collect::<String>());
+
+ let name = post_head
+ .select(&SEL_SPAN_POSTER_NAME)
+ .next()
+ .map(|el| el.text().collect::<String>())
+ .and_then(|n| if n == "Аноним" { None } else { Some(n) });
+
+ let mailto = post_head
+ .select(&SEL_A_POST_MAIL)
+ .next()
+ .and_then(|el| el.value().attr("title"))
+ .map(|s| s.to_string());
+
+ let time = post_head
+ .select(&SEL_SPAN_POST_TIME)
+ .next()
+ .context("missing post_time")?
+ .text()
+ .collect::<String>();
+
+ let num = post_head
+ .select(&SEL_SPAN_POST_NUM)
+ .next()
+ .context("missing post_num")?
+ .text()
+ .collect::<String>();
+
+ Ok(PostHead { subject, name, mailto, time, num, id })
+ }
+
+    /// Parses the span post_comment element from a post element
+ ///
+ /// Returns (files, text)
+ ///
+ /// Example element:
+ /// <span class="post_comment">
+ /// <div class="post_image_block" ...>...</div> (see parse_post_image_block function) (can appear 0 to multiple times)
+ /// <div class="post_comment_body">...</div> (see parse_post_comment_body function)
+ /// </span>
+ fn parse_post_comment(
+ node: scraper::ElementRef,
+ ) -> Result<(Vec<File>, String)> {
+ static SEL_POST_IMAGE_BLOCK: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new(
+ || scraper::Selector::parse("div.post_image_block").unwrap()
+ );
+ static SEL_POST_COMMENT_BODY: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new(
+ || scraper::Selector::parse("div.post_comment_body").unwrap()
+ );
+
+ // TODO handle the errors instead of propagating them upper. Change the return type to non-Result
+ let files: Vec<File> = node
+ .select(&SEL_POST_IMAGE_BLOCK)
+ .map(Post::parse_post_image_block)
+ .collect();
+ let text = Post::parse_post_comment_body(node
+ .select(&SEL_POST_COMMENT_BODY)
+ .next()
+ .context("missing post_comment_body")?);
+ Ok((files, text))
+ }
+
+ /// Parses "post_image_block" element
+ /// Returns File
+ ///
+ /// Example element:
+ /// ```html
+ /// <div class="post_image_block" id="pib_77_2" pib="77_2" title="537.4 Кб, 946 x 946
+ /// image.png
+ /// 17699092523481.png">
+ /// <a class="expand_image" onclick="expand_local('77_2','/storage/a/cc/acc7f5856bc60ad3bdbd4dc7027e33f9.png','946','946',event); return false;" href="#">
+ /// <div class="post_image" id="thumb_77_2">
+ /// <img src="/storage/t/acc7f5856bc60ad3bdbd4dc7027e33f9.png" alt="" loading="lazy"> // thumbnail path
+ /// </div>
+ /// </a>
+ /// <a href="/storage/a/cc/acc7f5856bc60ad3bdbd4dc7027e33f9.png" target="_blank" class="img_filename">image.png</a> // can also be https://i.arhivach.vc/... if it's a video
+ /// </div>
+ /// ```
+ fn parse_post_image_block(pib: scraper::ElementRef) -> File {
+ static SEL_POST_IMAGE_IMG: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new(
+ || scraper::Selector::parse(".post_image img").unwrap()
+ );
+ static SEL_A_IMG_FILENAME: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new(
+ || scraper::Selector::parse("a.img_filename").unwrap()
+ );
+
+ // Title example:
+ // 402.2 Кб, 800 x 532
+ // image.png <- name_orig
+ // 17699142349880.png <- name_timestamp
+ let title = pib.value().attr("title").unwrap_or("");
+ let title_lines: Vec<&str> = title.lines().collect();
+ let name_orig = title_lines
+ .get(1)
+ .map(|s| s.to_string())
+ .unwrap_or("unnamed".to_string());
+ let name_timestamp = title_lines
+ .get(2)
+ .map(|s| s.to_string())
+ .unwrap_or("unnamed".to_string());
+
+ // url_thumb
+ let url_thumb = pib
+ .select(&SEL_POST_IMAGE_IMG)
+ .next()
+ .and_then(|el| el.value().attr("src"))
+ .unwrap_or(""); // /storage/t/83c2fe5ba9a8469d9eeef4af124e3b52.thumb
+ let url_thumb = if url_thumb.is_empty() {
+ String::new()
+ } else {
+ format!("{BASE_URL}{url_thumb}")
+ };
+
+ // url
+ let url = pib
+ .select(&SEL_A_IMG_FILENAME)
+ .next()
+ .and_then(|el| el.value().attr("href"))
+ .unwrap_or("");
+ let url = if url.starts_with("http") { // is `https://i.arhivach.vc/...`?
+ url.to_string()
+ } else if url.is_empty() {
+ String::new()
+ } else {
+ format!("{BASE_URL}{url}")
+ };
+
+ File {
+ name_orig,
+ name_timestamp,
+ url_thumb,
+ url,
+ }
+ }
+
+ /// Parses the post text from `div.post_comment_body`
+ ///
+ /// Returns post text:
+ /// - References are plaintext (e.g. >>329274789)
+ /// - `<br>` is replaced with \n
+ /// - `<span class="unkfunc">` (greentext) is replaced with >text
+ ///
+ /// If the text contains a reference (e.g. >>329274789) it looks like this in the element:
+ /// ```html
+ /// <div class="post_comment_body">
+ /// <a href="#329274893" class="post-reply-link" data-thread="329273515" data-num="329274893">>>329274893</a> // This will be replaced with >>329274893
+ /// <br>
+ /// <span class="unkfunc">>greentext1</span>
+ /// <br>
+ /// text1
+ /// </div>
+ /// ```
+ ///
+ /// This example returns:
+ /// ```text
+ /// >>329274893
+ /// >greentext1
+ /// text1
+ /// ```
+ fn parse_post_comment_body(node: scraper::ElementRef) -> String {
+ use scraper::node::Node;
+
+ let mut result = String::new();
+ for child in node.children() {
+ match child.value() {
+ Node::Text(text) => result.push_str(&text.text),
+ Node::Element(el) if el.name() == "br" => result.push('\n'),
+ Node::Element(_) => {
+ if let Some(el_ref) = scraper::ElementRef::wrap(child) {
+ result.push_str(&el_ref.text().collect::<String>());
+ }
+ }
+ _ => {}
+ }
+ }
+ result.trim().to_string()
+ }
+}
+
+impl std::fmt::Display for Post {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ // Header line
+ let name = self.name.as_deref().unwrap_or("Аноним");
+ let mailto = self.mailto.as_deref().unwrap_or("");
+
+ if !mailto.is_empty() {
+ write!(f, "{} ({})", name, mailto)?;
+ } else {
+ write!(f, "{}", name)?;
+ }
+
+ write!(f, " {} {} ID:{}", self.time, self.num, self.id)?;
+
+ // Subject
+ if let Some(ref subject) = self.subject {
+ write!(f, "\n{}", subject)?;
+ }
+
+ // Files
+ if !self.files.is_empty() {
+ write!(f, "\n[Files: {}]", self.files.len())?;
+ for file in &self.files {
+ write!(f, "\n - {}", file)?;
+ }
+ }
+
+ // Post text
+ if !self.text.is_empty() {
+ write!(f, "\n{}", self.text)?;
+ }
+
+ Ok(())
+ }
+}
diff --git a/src/post.rs b/src/post.rs
@@ -1,374 +0,0 @@
-use anyhow::{Context, Result};
-
-const BASE_URL: &str = "https://arhivach.vc";
-
-#[derive(Debug, Clone)]
-pub struct File {
- /// original name, "videolol.mp4"
- pub name_orig: String,
- /// timestampname, "17699100670710.mp4"
- pub name_timestamp: String,
- /// thumbnail url, "https://arhivach.vc/storage/t/aeaa7825f8d8ffe3f07f242a59b7761c.thumb"
- pub url_thumb: String,
- /// url, "https://i.arhivach.vc/storage/a/ea/aeaa7825f8d8ffe3f07f242a59b7761c.mp4"
- pub url: String,
-}
-
-impl std::fmt::Display for File {
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
- write!(
- f,
- "{} [{}]\n url: {}\n thumb: {}",
- self.name_orig, self.name_timestamp, self.url, self.url_thumb
- )
- }
-}
-
-struct PostHead {
- subject: Option<String>,
- name: Option<String>,
- mailto: Option<String>,
- time: String,
- num: String,
- id: u32,
-}
-
-/// Represents a single post in a thread
-#[derive(Debug, Clone)]
-pub struct Post {
- /// Empty if None
- pub subject: Option<String>,
- /// "Аноним" if none
- pub name: Option<String>,
- /// "mailto:sage"
- pub mailto: Option<String>,
- /// "01/02/26 Вск 03:13:12"
- pub time: String,
- /// "#5"
- pub num: String,
- /// "329281515"
- pub id: u32,
- pub files: Vec<File>,
- /// Post text
- pub text: String,
-}
-
-impl Post {
- pub fn parse_posts(
- html: &str,
- ) -> Result<Vec<Post>> {
- let mut posts = Vec::new();
-
- let document = scraper::Html::parse_document(html);
- let selector = scraper::Selector::parse(r#"div.post"#).unwrap();
- for node in document.select(&selector) {
- let post = Post::parse_post(node)?;
- posts.push(post);
- }
-
- Ok(posts)
- }
-
- /// Parse div class="post"
- ///
- /// Example element:
- /// ```html
- /// <div class="post" id="post329274763" postid="329274763">
- /// <div class="post_head">...</div> (see parse_post_head function)
- /// <span class="post_comment">...</span> (see parse_post_comment function)
- /// </div>
- /// ```
- fn parse_post(node: scraper::ElementRef) -> Result<Post> {
- static SEL_POST_HEAD: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new(
- || scraper::Selector::parse("div.post_head").unwrap()
- );
- static SEL_POST_IMAGE_BLOCK: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new(
- || scraper::Selector::parse("span.post_comment").unwrap()
- );
-
- let post_head = node
- .select(&SEL_POST_HEAD)
- .next()
- .context("missing post_head")?;
- let head = Post::parse_post_head(post_head)?;
-
- let post_comment = node
- .select(&SEL_POST_IMAGE_BLOCK)
- .next()
- .context("missing post_comment")?;
- let (files, text) = Post::parse_post_comment(post_comment)?;
-
- Ok(Post {
- subject: head.subject,
- name: head.name,
- mailto: head.mailto,
- time: head.time,
- num: head.num,
- id: head.id,
- files,
- text,
- })
- }
-
- /// Parses the post_head element
- ///
- /// Returns (subject, name, mailto, time, num, id)
- /// Returns error if no time, num or id is found or if id is not a number
- ///
- /// Example element:
- /// ```html
- /// <div class="post_head">
- /// <span class="poster_name" title="">Аноним</span>
- /// <span class="post_time">01/02/26 Вск 04:27:32</span>
- /// <span class="post_num">#77</span>
- /// <span class="post_id">
- /// <a style="position:absolute;margin-top:-50px;" id="329274763"></a>
- /// <a href="#329274763">№329274763</a>
- /// </span>
- /// </div>
- /// ```
- fn parse_post_head(post_head: scraper::ElementRef) -> Result<PostHead> {
- static SEL_SPAN_POST_ID_A_HREF: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new(
- || scraper::Selector::parse("span.post_id a[href]").unwrap()
- );
- static SEL_H1_POST_SUBJECT: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new(
- || scraper::Selector::parse("h1.post_subject").unwrap()
- );
- static SEL_SPAN_POSTER_NAME: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new(
- || scraper::Selector::parse("span.poster_name").unwrap()
- );
- static SEL_A_POST_MAIL: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new(
- || scraper::Selector::parse("a.post_mail").unwrap()
- );
- static SEL_SPAN_POST_TIME: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new(
- || scraper::Selector::parse("span.post_time").unwrap()
- );
- static SEL_SPAN_POST_NUM: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new(
- || scraper::Selector::parse("span.post_num").unwrap()
- );
-
- let id: u32 = post_head
- .select(&SEL_SPAN_POST_ID_A_HREF)
- .next()
- .and_then(|el| el.value().attr("href"))
- .and_then(|href| href.strip_prefix('#'))
- .context("missing post id")?
- .parse()?;
-
- let subject = post_head
- .select(&SEL_H1_POST_SUBJECT)
- .next()
- .map(|el| el.text().collect::<String>());
-
- let name = post_head
- .select(&SEL_SPAN_POSTER_NAME)
- .next()
- .map(|el| el.text().collect::<String>())
- .and_then(|n| if n == "Аноним" { None } else { Some(n) });
-
- let mailto = post_head
- .select(&SEL_A_POST_MAIL)
- .next()
- .and_then(|el| el.value().attr("title"))
- .map(|s| s.to_string());
-
- let time = post_head
- .select(&SEL_SPAN_POST_TIME)
- .next()
- .context("missing post_time")?
- .text()
- .collect::<String>();
-
- let num = post_head
- .select(&SEL_SPAN_POST_NUM)
- .next()
- .context("missing post_num")?
- .text()
- .collect::<String>();
-
- Ok(PostHead { subject, name, mailto, time, num, id })
- }
-
- /// Parses the sapn post_comment element from a post element
- ///
- /// Returns (files, text)
- ///
- /// Example element:
- /// <span class="post_comment">
- /// <div class="post_image_block" ...>...</div> (see parse_post_image_block function) (can appear 0 to multiple times)
- /// <div class="post_comment_body">...</div> (see parse_post_comment_body function)
- /// </span>
- fn parse_post_comment(
- node: scraper::ElementRef,
- ) -> Result<(Vec<File>, String)> {
- static SEL_POST_IMAGE_BLOCK: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new(
- || scraper::Selector::parse("div.post_image_block").unwrap()
- );
- static SEL_POST_COMMENT_BODY: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new(
- || scraper::Selector::parse("div.post_comment_body").unwrap()
- );
-
- // TODO handle the errors instead of propagating them upper. Change the return type to non-Result
- let files: Vec<File> = node
- .select(&SEL_POST_IMAGE_BLOCK)
- .map(Post::parse_post_image_block)
- .collect();
- let text = Post::parse_post_comment_body(node
- .select(&SEL_POST_COMMENT_BODY)
- .next()
- .context("missing post_comment_body")?);
- Ok((files, text))
- }
-
- /// Parses "post_image_block" element
- /// Returns File
- ///
- /// Example element:
- /// ```html
- /// <div class="post_image_block" id="pib_77_2" pib="77_2" title="537.4 Кб, 946 x 946
- /// image.png
- /// 17699092523481.png">
- /// <a class="expand_image" onclick="expand_local('77_2','/storage/a/cc/acc7f5856bc60ad3bdbd4dc7027e33f9.png','946','946',event); return false;" href="#">
- /// <div class="post_image" id="thumb_77_2">
- /// <img src="/storage/t/acc7f5856bc60ad3bdbd4dc7027e33f9.png" alt="" loading="lazy"> // thumbnail path
- /// </div>
- /// </a>
- /// <a href="/storage/a/cc/acc7f5856bc60ad3bdbd4dc7027e33f9.png" target="_blank" class="img_filename">image.png</a> // can also be https://i.arhivach.vc/... if it's a video
- /// </div>
- /// ```
- fn parse_post_image_block(pib: scraper::ElementRef) -> File {
- static SEL_POST_IMAGE_IMG: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new(
- || scraper::Selector::parse(".post_image img").unwrap()
- );
- static SEL_A_IMG_FILENAME: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new(
- || scraper::Selector::parse("a.img_filename").unwrap()
- );
-
- // Title example:
- // 402.2 Кб, 800 x 532
- // image.png <- name_orig
- // 17699142349880.png <- name_timestamp
- let title = pib.value().attr("title").unwrap_or("");
- let title_lines: Vec<&str> = title.lines().collect();
- let name_orig = title_lines
- .get(1)
- .map(|s| s.to_string())
- .unwrap_or("unnamed".to_string());
- let name_timestamp = title_lines
- .get(2)
- .map(|s| s.to_string())
- .unwrap_or("unnamed".to_string());
-
- // url_thumb
- let url_thumb = pib
- .select(&SEL_POST_IMAGE_IMG)
- .next()
- .and_then(|el| el.value().attr("src"))
- .unwrap_or(""); // /storage/t/83c2fe5ba9a8469d9eeef4af124e3b52.thumb
- let url_thumb = if url_thumb.is_empty() {
- String::new()
- } else {
- format!("{BASE_URL}{url_thumb}")
- };
-
- // url
- let url = pib
- .select(&SEL_A_IMG_FILENAME)
- .next()
- .and_then(|el| el.value().attr("href"))
- .unwrap_or("");
- let url = if url.starts_with("http") { // is `https://i.arhivach.vc/...`?
- url.to_string()
- } else if url.is_empty() {
- String::new()
- } else {
- format!("{BASE_URL}{url}")
- };
-
- File {
- name_orig,
- name_timestamp,
- url_thumb,
- url,
- }
- }
-
- /// Parses the post text from `div.post_comment_body`
- ///
- /// Returns post text:
- /// - References are plaintext (e.g. >>329274789)
- /// - `<br>` is replaced with \n
- /// - `<span class="unkfunc">` (greentext) is replaced with >text
- ///
- /// If the text contains a reference (e.g. >>329274789) it looks like this in the element:
- /// ```html
- /// <div class="post_comment_body">
- /// <a href="#329274893" class="post-reply-link" data-thread="329273515" data-num="329274893">>>329274893</a> // This will be replaced with >>329274893
- /// <br>
- /// <span class="unkfunc">>greentext1</span>
- /// <br>
- /// text1
- /// </div>
- /// ```
- ///
- /// This example returns:
- /// ```text
- /// >>329274893
- /// >greentext1
- /// text1
- /// ```
- fn parse_post_comment_body(node: scraper::ElementRef) -> String {
- use scraper::node::Node;
-
- let mut result = String::new();
- for child in node.children() {
- match child.value() {
- Node::Text(text) => result.push_str(&text.text),
- Node::Element(el) if el.name() == "br" => result.push('\n'),
- Node::Element(_) => {
- if let Some(el_ref) = scraper::ElementRef::wrap(child) {
- result.push_str(&el_ref.text().collect::<String>());
- }
- }
- _ => {}
- }
- }
- result.trim().to_string()
- }
-}
-
-impl std::fmt::Display for Post {
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
- // Header line
- let name = self.name.as_deref().unwrap_or("Аноним");
- let mailto = self.mailto.as_deref().unwrap_or("");
-
- if !mailto.is_empty() {
- write!(f, "{} ({})", name, mailto)?;
- } else {
- write!(f, "{}", name)?;
- }
-
- write!(f, " {} {} ID:{}", self.time, self.num, self.id)?;
-
- // Subject
- if let Some(ref subject) = self.subject {
- write!(f, "\n{}", subject)?;
- }
-
- // Files
- if !self.files.is_empty() {
- write!(f, "\n[Files: {}]", self.files.len())?;
- for file in &self.files {
- write!(f, "\n - {}", file)?;
- }
- }
-
- // Post text
- if !self.text.is_empty() {
- write!(f, "\n{}", self.text)?;
- }
-
- Ok(())
- }
-}