post.rs - arhivach-downloader - Download arhivach.vc threads

post.rs (12924B)
      1 use super::BASE_URL;
      2 use crate::error::{Error, Result};
      3 
/// A single attachment (image or video) of a post.
#[derive(Debug, Clone)]
pub struct File {
    /// Original file name as uploaded, e.g. "videolol.mp4".
    /// Set to "unnamed" when the image block's title has no second line.
    pub name_orig: String,
    /// Timestamp-based file name, e.g. "17699100670710.mp4".
    /// Set to "unnamed" when the image block's title has no third line.
    pub name_timestamp: String,
    /// Thumbnail URL, e.g. "https://arhivach.vc/storage/t/aeaa7825f8d8ffe3f07f242a59b7761c.thumb".
    /// Empty when no thumbnail `<img>` was found.
    pub url_thumb: String,
    /// File URL, e.g. "https://i.arhivach.vc/storage/a/ea/aeaa7825f8d8ffe3f07f242a59b7761c.mp4".
    /// Empty when no file link was found.
    pub url: String,
}
     15 
     16 impl std::fmt::Display for File {
     17     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
     18         write!(
     19             f,
     20             "{} [{}]\n  url:   {}\n  thumb: {}",
     21             self.name_orig, self.name_timestamp, self.url, self.url_thumb
     22         )
     23     }
     24 }
     25 
/// Intermediate result of `Post::parse_post_head`: the fields extracted from
/// a post's `div.post_head` element, before files and text are attached.
struct PostHead {
    /// Text of `h1.post_subject`, if the element exists.
    subject: Option<String>,
    /// Poster name; `None` for the default name "Аноним" or when the element is missing.
    name: Option<String>,
    /// `title` attribute of `a.post_mail`, if present.
    mailto: Option<String>,
    /// Text of `span.post_time`.
    time: String,
    /// Text of `span.post_num`.
    num: String,
    /// Numeric post id parsed from the `#...` href inside `span.post_id`.
    id: u32,
}
     34 
/// Represents a single post in a thread
#[derive(Debug, Clone)]
pub struct Post {
    /// Post subject; `None` when the post head has no `h1.post_subject` element
    pub subject: Option<String>,
    /// Poster name; `None` when the poster uses the default name "Аноним"
    /// (the `Display` impl prints "Аноним" in that case)
    pub name: Option<String>,
    /// `title` attribute of the post's mail link, e.g. "mailto:sage"; `None` when there is no mail link
    pub mailto: Option<String>,
    /// Post time as shown on the page, e.g. "01/02/26 Вск 03:13:12"
    pub time: String,
    /// Position of the post in the thread as shown on the page, e.g. "#5"
    pub num: String,
    /// Numeric post id, e.g. 329281515
    pub id: u32,
    /// Attachments of the post, in document order; empty when the post has none
    pub files: Vec<File>,
    /// Post text
    pub text: String,
}
     54 
     55 impl Post {
     56     pub fn parse_posts(
     57         html: &str,
     58     ) -> Result<Vec<Post>> {
     59         let mut posts = Vec::new();
     60 
     61         let document = scraper::Html::parse_document(html);
     62         let selector = scraper::Selector::parse(r#"div.post"#).unwrap();
     63         for node in document.select(&selector) {
     64             let post = Post::parse_post(node)?;
     65             posts.push(post);
     66         }
     67 
     68         Ok(posts)
     69     }
     70 
     71     /// Parse div class="post"
     72     ///
     73     /// Example element:
     74     /// ```html
     75     /// <div class="post" id="post329274763" postid="329274763">
     76     ///     <div class="post_head">...</div> (see parse_post_head function)
     77     ///     <span class="post_comment">...</span> (see parse_post_comment function)
     78     /// </div>
     79     /// ```
     80     fn parse_post(node: scraper::ElementRef) -> Result<Post> {
     81         static SEL_POST_HEAD: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new(
     82             || scraper::Selector::parse("div.post_head").unwrap()
     83         );
     84         static SEL_POST_IMAGE_BLOCK: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new(
     85             || scraper::Selector::parse("span.post_comment").unwrap()
     86         );
     87 
     88         let post_head = node
     89             .select(&SEL_POST_HEAD)
     90             .next()
     91             .ok_or(Error::MissingElement("post_head"))?;
     92         let head = Post::parse_post_head(post_head)?;
     93 
     94         let post_comment = node
     95             .select(&SEL_POST_IMAGE_BLOCK)
     96             .next()
     97             .ok_or(Error::MissingElement("post_comment"))?;
     98         let (files, text) = Post::parse_post_comment(post_comment)?;
     99 
    100         Ok(Post {
    101             subject: head.subject,
    102             name: head.name,
    103             mailto: head.mailto,
    104             time: head.time,
    105             num: head.num,
    106             id: head.id,
    107             files,
    108             text,
    109         })
    110     }
    111 
    112     /// Parses the post_head element
    113     ///
    114     /// Returns (subject, name, mailto, time, num, id)
    115     /// Returns error if no time, num or id is found or if id is not a number
    116     ///
    117     /// Example element:
    118     /// ```html
    119     /// <div class="post_head">
    120     ///     <span class="poster_name" title="">Аноним</span>&nbsp;
    121     ///     <span class="post_time">01/02/26 Вск 04:27:32</span>&nbsp;
    122     ///     <span class="post_num">#77</span>&nbsp;
    123     ///     <span class="post_id">
    124     ///         <a style="position:absolute;margin-top:-50px;" id="329274763"></a>
    125     ///         <a href="#329274763">№329274763</a>
    126     ///     </span> &nbsp;
    127     /// </div>
    128     /// ```
    129     fn parse_post_head(post_head: scraper::ElementRef) -> Result<PostHead> {
    130         static SEL_SPAN_POST_ID_A_HREF: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new(
    131             || scraper::Selector::parse("span.post_id a[href]").unwrap()
    132         );
    133         static SEL_H1_POST_SUBJECT: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new(
    134             || scraper::Selector::parse("h1.post_subject").unwrap()
    135         );
    136         static SEL_SPAN_POSTER_NAME: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new(
    137             || scraper::Selector::parse("span.poster_name").unwrap()
    138         );
    139         static SEL_A_POST_MAIL: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new(
    140             || scraper::Selector::parse("a.post_mail").unwrap()
    141         );
    142         static SEL_SPAN_POST_TIME: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new(
    143             || scraper::Selector::parse("span.post_time").unwrap()
    144         );
    145         static SEL_SPAN_POST_NUM: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new(
    146             || scraper::Selector::parse("span.post_num").unwrap()
    147         );
    148 
    149         let id: u32 = post_head
    150             .select(&SEL_SPAN_POST_ID_A_HREF)
    151             .next()
    152             .and_then(|el| el.value().attr("href"))
    153             .and_then(|href| href.strip_prefix('#'))
    154             .ok_or(Error::MissingElement("post id"))?
    155             .parse()?;
    156 
    157         let subject = post_head
    158             .select(&SEL_H1_POST_SUBJECT)
    159             .next()
    160             .map(|el| el.text().collect::<String>());
    161 
    162         let name = post_head
    163             .select(&SEL_SPAN_POSTER_NAME)
    164             .next()
    165             .map(|el| el.text().collect::<String>())
    166             .and_then(|n| if n == "Аноним" { None } else { Some(n) });
    167 
    168         let mailto = post_head
    169             .select(&SEL_A_POST_MAIL)
    170             .next()
    171             .and_then(|el| el.value().attr("title"))
    172             .map(|s| s.to_string());
    173 
    174         let time = post_head
    175             .select(&SEL_SPAN_POST_TIME)
    176             .next()
    177             .ok_or(Error::MissingElement("post_time"))?
    178             .text()
    179             .collect::<String>();
    180 
    181         let num = post_head
    182             .select(&SEL_SPAN_POST_NUM)
    183             .next()
    184             .ok_or(Error::MissingElement("post_num"))?
    185             .text()
    186             .collect::<String>();
    187 
    188         Ok(PostHead { subject, name, mailto, time, num, id })
    189     }
    190 
    191     /// Parses the sapn post_comment element from a post element
    192     ///
    193     /// Returns (files, text)
    194     ///
    195     /// Example element:
    196     /// <span class="post_comment">
    197     ///     <div class="post_image_block" ...>...</div> (see parse_post_image_block function) (can appear 0 to multiple times)
    198     ///     <div class="post_comment_body">...</div> (see parse_post_comment_body function)
    199     /// </span>
    200     fn parse_post_comment(
    201         node: scraper::ElementRef,
    202     ) -> Result<(Vec<File>, String)> {
    203         static SEL_POST_IMAGE_BLOCK: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new(
    204             || scraper::Selector::parse("div.post_image_block").unwrap()
    205         );
    206         static SEL_POST_COMMENT_BODY: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new(
    207             || scraper::Selector::parse("div.post_comment_body").unwrap()
    208         );
    209 
    210         // TODO handle the errors instead of propagating them upper. Change the return type to non-Result
    211         let files: Vec<File> = node
    212             .select(&SEL_POST_IMAGE_BLOCK)
    213             .map(Post::parse_post_image_block)
    214             .collect();
    215         let text = Post::parse_post_comment_body(node
    216             .select(&SEL_POST_COMMENT_BODY)
    217             .next()
    218             .ok_or(Error::MissingElement("post_comment_body"))?);
    219         Ok((files, text))
    220     }
    221 
    222     /// Parses "post_image_block" element
    223     /// Returns File
    224     ///
    225     /// Example element:
    226     /// ```html
    227     /// <div class="post_image_block" id="pib_77_2" pib="77_2" title="537.4 Кб, 946 x 946
    228     /// image.png
    229     /// 17699092523481.png">
    230     ///     <a class="expand_image" onclick="expand_local('77_2','/storage/a/cc/acc7f5856bc60ad3bdbd4dc7027e33f9.png','946','946',event); return false;" href="#">
    231     ///         <div class="post_image" id="thumb_77_2">
    232     ///             <img src="/storage/t/acc7f5856bc60ad3bdbd4dc7027e33f9.png" alt="" loading="lazy"> // thumbnail path
    233     ///         </div>
    234     ///     </a>
    235     ///     <a href="/storage/a/cc/acc7f5856bc60ad3bdbd4dc7027e33f9.png" target="_blank" class="img_filename">image.png</a> // can also be https://i.arhivach.vc/... if it's a video
    236     /// </div>
    237     /// ```
    238     fn parse_post_image_block(pib: scraper::ElementRef) -> File {
    239         static SEL_POST_IMAGE_IMG: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new(
    240             || scraper::Selector::parse(".post_image img").unwrap()
    241         );
    242         static SEL_A_IMG_FILENAME: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new(
    243             || scraper::Selector::parse("a.img_filename").unwrap()
    244         );
    245 
    246         // Title example:
    247         // 402.2 Кб, 800 x 532
    248         // image.png <- name_orig
    249         // 17699142349880.png <- name_timestamp
    250         let title = pib.value().attr("title").unwrap_or("");
    251         let title_lines: Vec<&str> = title.lines().collect();
    252         let name_orig      = title_lines
    253             .get(1)
    254             .map(|s| s.to_string())
    255             .unwrap_or("unnamed".to_string());
    256         let name_timestamp = title_lines
    257             .get(2)
    258             .map(|s| s.to_string())
    259             .unwrap_or("unnamed".to_string());
    260 
    261         // url_thumb
    262         let url_thumb = pib
    263             .select(&SEL_POST_IMAGE_IMG)
    264             .next()
    265             .and_then(|el| el.value().attr("src"))
    266             .unwrap_or(""); // /storage/t/83c2fe5ba9a8469d9eeef4af124e3b52.thumb
    267         let url_thumb = if url_thumb.is_empty() {
    268             String::new()
    269         } else {
    270             format!("{BASE_URL}{url_thumb}")
    271         };
    272 
    273         // url
    274         let url = pib
    275             .select(&SEL_A_IMG_FILENAME)
    276             .next()
    277             .and_then(|el| el.value().attr("href"))
    278             .unwrap_or("");
    279         let url = if url.starts_with("http") { // is `https://i.arhivach.vc/...`?
    280             url.to_string()
    281         } else if url.is_empty() {
    282             String::new()
    283         } else {
    284             format!("{BASE_URL}{url}")
    285         };
    286 
    287         File {
    288             name_orig,
    289             name_timestamp,
    290             url_thumb,
    291             url,
    292         }
    293     }
    294 
    295     /// Parses the post text from `div.post_comment_body`
    296     ///
    297     /// Returns post text:
    298     /// - References are plaintext (e.g. >>329274789)
    299     /// - `<br>` is replaced with \n
    300     /// - `<span class="unkfunc">` (greentext) is replaced with >text
    301     ///
    302     /// If the text contains a reference (e.g. >>329274789) it looks like this in the element:
    303     /// ```html
    304     /// <div class="post_comment_body">
    305     ///     <a href="#329274893" class="post-reply-link" data-thread="329273515" data-num="329274893">&gt;&gt;329274893</a> // This will be replaced with >>329274893
    306     ///     <br>
    307     ///     <span class="unkfunc">&gt;greentext1</span>
    308     ///     <br>
    309     ///     text1
    310     /// </div>
    311     /// ```
    312     ///
    313     /// This example returns:
    314     /// ```text
    315     /// >>329274893
    316     /// >greentext1
    317     /// text1
    318     /// ```
    319     fn parse_post_comment_body(node: scraper::ElementRef) -> String {
    320         use scraper::node::Node;
    321 
    322         let mut result = String::new();
    323         for child in node.children() {
    324             match child.value() {
    325                 Node::Text(text) => result.push_str(&text.text),
    326                 Node::Element(el) if el.name() == "br" => result.push('\n'),
    327                 Node::Element(_) => {
    328                     if let Some(el_ref) = scraper::ElementRef::wrap(child) {
    329                         result.push_str(&el_ref.text().collect::<String>());
    330                     }
    331                 }
    332                 _ => {}
    333             }
    334         }
    335         result.trim().to_string()
    336     }
    337 }
    338 
    339 impl std::fmt::Display for Post {
    340     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    341         // Header line
    342         let name = self.name.as_deref().unwrap_or("Аноним");
    343         let mailto = self.mailto.as_deref().unwrap_or("");
    344 
    345         if !mailto.is_empty() {
    346             write!(f, "{} ({})", name, mailto)?;
    347         } else {
    348             write!(f, "{}", name)?;
    349         }
    350 
    351         write!(f, " {} {} ID:{}", self.time, self.num, self.id)?;
    352 
    353         // Subject
    354         if let Some(ref subject) = self.subject {
    355             write!(f, "\n{}", subject)?;
    356         }
    357 
    358         // Files
    359         if !self.files.is_empty() {
    360             write!(f, "\n[Files: {}]", self.files.len())?;
    361             for file in &self.files {
    362                 write!(f, "\n  - {}", file)?;
    363             }
    364         }
    365 
    366         // Post text
    367         if !self.text.is_empty() {
    368             write!(f, "\n{}", self.text)?;
    369         }
    370 
    371         Ok(())
    372     }
    373 }
	arhivach-downloader Download arhivach.vc threads
	git clone https://git.ea.contact/arhivach-downloader
	Log \| Files \| Refs \| README