post.rs (12924B)
1 use super::BASE_URL; 2 use crate::error::{Error, Result}; 3 4 #[derive(Debug, Clone)] 5 pub struct File { 6 /// original name, "videolol.mp4" 7 pub name_orig: String, 8 /// timestampname, "17699100670710.mp4" 9 pub name_timestamp: String, 10 /// thumbnail url, "https://arhivach.vc/storage/t/aeaa7825f8d8ffe3f07f242a59b7761c.thumb" 11 pub url_thumb: String, 12 /// url, "https://i.arhivach.vc/storage/a/ea/aeaa7825f8d8ffe3f07f242a59b7761c.mp4" 13 pub url: String, 14 } 15 16 impl std::fmt::Display for File { 17 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 18 write!( 19 f, 20 "{} [{}]\n url: {}\n thumb: {}", 21 self.name_orig, self.name_timestamp, self.url, self.url_thumb 22 ) 23 } 24 } 25 26 struct PostHead { 27 subject: Option<String>, 28 name: Option<String>, 29 mailto: Option<String>, 30 time: String, 31 num: String, 32 id: u32, 33 } 34 35 /// Represents a single post in a thread 36 #[derive(Debug, Clone)] 37 pub struct Post { 38 /// Empty if None 39 pub subject: Option<String>, 40 /// "Аноним" if none 41 pub name: Option<String>, 42 /// "mailto:sage" 43 pub mailto: Option<String>, 44 /// "01/02/26 Вск 03:13:12" 45 pub time: String, 46 /// "#5" 47 pub num: String, 48 /// "329281515" 49 pub id: u32, 50 pub files: Vec<File>, 51 /// Post text 52 pub text: String, 53 } 54 55 impl Post { 56 pub fn parse_posts( 57 html: &str, 58 ) -> Result<Vec<Post>> { 59 let mut posts = Vec::new(); 60 61 let document = scraper::Html::parse_document(html); 62 let selector = scraper::Selector::parse(r#"div.post"#).unwrap(); 63 for node in document.select(&selector) { 64 let post = Post::parse_post(node)?; 65 posts.push(post); 66 } 67 68 Ok(posts) 69 } 70 71 /// Parse div class="post" 72 /// 73 /// Example element: 74 /// ```html 75 /// <div class="post" id="post329274763" postid="329274763"> 76 /// <div class="post_head">...</div> (see parse_post_head function) 77 /// <span class="post_comment">...</span> (see parse_post_comment function) 78 /// </div> 79 /// ``` 80 fn parse_post(node: scraper::ElementRef) -> Result<Post> { 81 static SEL_POST_HEAD: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new( 82 || scraper::Selector::parse("div.post_head").unwrap() 83 ); 84 static SEL_POST_IMAGE_BLOCK: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new( 85 || scraper::Selector::parse("span.post_comment").unwrap() 86 ); 87 88 let post_head = node 89 .select(&SEL_POST_HEAD) 90 .next() 91 .ok_or(Error::MissingElement("post_head"))?; 92 let head = Post::parse_post_head(post_head)?; 93 94 let post_comment = node 95 .select(&SEL_POST_IMAGE_BLOCK) 96 .next() 97 .ok_or(Error::MissingElement("post_comment"))?; 98 let (files, text) = Post::parse_post_comment(post_comment)?; 99 100 Ok(Post { 101 subject: head.subject, 102 name: head.name, 103 mailto: head.mailto, 104 time: head.time, 105 num: head.num, 106 id: head.id, 107 files, 108 text, 109 }) 110 } 111 112 /// Parses the post_head element 113 /// 114 /// Returns (subject, name, mailto, time, num, id) 115 /// Returns error if no time, num or id is found or if id is not a number 116 /// 117 /// Example element: 118 /// ```html 119 /// <div class="post_head"> 120 /// <span class="poster_name" title="">Аноним</span> 121 /// <span class="post_time">01/02/26 Вск 04:27:32</span> 122 /// <span class="post_num">#77</span> 123 /// <span class="post_id"> 124 /// <a style="position:absolute;margin-top:-50px;" id="329274763"></a> 125 /// <a href="#329274763">№329274763</a> 126 /// </span> 127 /// </div> 128 /// ``` 129 fn parse_post_head(post_head: scraper::ElementRef) -> Result<PostHead> { 130 static SEL_SPAN_POST_ID_A_HREF: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new( 131 || scraper::Selector::parse("span.post_id a[href]").unwrap() 132 ); 133 static SEL_H1_POST_SUBJECT: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new( 134 || scraper::Selector::parse("h1.post_subject").unwrap() 135 ); 136 static SEL_SPAN_POSTER_NAME: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new( 137 || scraper::Selector::parse("span.poster_name").unwrap() 138 ); 139 static SEL_A_POST_MAIL: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new( 140 || scraper::Selector::parse("a.post_mail").unwrap() 141 ); 142 static SEL_SPAN_POST_TIME: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new( 143 || scraper::Selector::parse("span.post_time").unwrap() 144 ); 145 static SEL_SPAN_POST_NUM: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new( 146 || scraper::Selector::parse("span.post_num").unwrap() 147 ); 148 149 let id: u32 = post_head 150 .select(&SEL_SPAN_POST_ID_A_HREF) 151 .next() 152 .and_then(|el| el.value().attr("href")) 153 .and_then(|href| href.strip_prefix('#')) 154 .ok_or(Error::MissingElement("post id"))? 155 .parse()?; 156 157 let subject = post_head 158 .select(&SEL_H1_POST_SUBJECT) 159 .next() 160 .map(|el| el.text().collect::<String>()); 161 162 let name = post_head 163 .select(&SEL_SPAN_POSTER_NAME) 164 .next() 165 .map(|el| el.text().collect::<String>()) 166 .and_then(|n| if n == "Аноним" { None } else { Some(n) }); 167 168 let mailto = post_head 169 .select(&SEL_A_POST_MAIL) 170 .next() 171 .and_then(|el| el.value().attr("title")) 172 .map(|s| s.to_string()); 173 174 let time = post_head 175 .select(&SEL_SPAN_POST_TIME) 176 .next() 177 .ok_or(Error::MissingElement("post_time"))? 178 .text() 179 .collect::<String>(); 180 181 let num = post_head 182 .select(&SEL_SPAN_POST_NUM) 183 .next() 184 .ok_or(Error::MissingElement("post_num"))? 185 .text() 186 .collect::<String>(); 187 188 Ok(PostHead { subject, name, mailto, time, num, id }) 189 } 190 191 /// Parses the sapn post_comment element from a post element 192 /// 193 /// Returns (files, text) 194 /// 195 /// Example element: 196 /// <span class="post_comment"> 197 /// <div class="post_image_block" ...>...</div> (see parse_post_image_block function) (can appear 0 to multiple times) 198 /// <div class="post_comment_body">...</div> (see parse_post_comment_body function) 199 /// </span> 200 fn parse_post_comment( 201 node: scraper::ElementRef, 202 ) -> Result<(Vec<File>, String)> { 203 static SEL_POST_IMAGE_BLOCK: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new( 204 || scraper::Selector::parse("div.post_image_block").unwrap() 205 ); 206 static SEL_POST_COMMENT_BODY: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new( 207 || scraper::Selector::parse("div.post_comment_body").unwrap() 208 ); 209 210 // TODO handle the errors instead of propagating them upper. Change the return type to non-Result 211 let files: Vec<File> = node 212 .select(&SEL_POST_IMAGE_BLOCK) 213 .map(Post::parse_post_image_block) 214 .collect(); 215 let text = Post::parse_post_comment_body(node 216 .select(&SEL_POST_COMMENT_BODY) 217 .next() 218 .ok_or(Error::MissingElement("post_comment_body"))?); 219 Ok((files, text)) 220 } 221 222 /// Parses "post_image_block" element 223 /// Returns File 224 /// 225 /// Example element: 226 /// ```html 227 /// <div class="post_image_block" id="pib_77_2" pib="77_2" title="537.4 Кб, 946 x 946 228 /// image.png 229 /// 17699092523481.png"> 230 /// <a class="expand_image" onclick="expand_local('77_2','/storage/a/cc/acc7f5856bc60ad3bdbd4dc7027e33f9.png','946','946',event); return false;" href="#"> 231 /// <div class="post_image" id="thumb_77_2"> 232 /// <img src="/storage/t/acc7f5856bc60ad3bdbd4dc7027e33f9.png" alt="" loading="lazy"> // thumbnail path 233 /// </div> 234 /// </a> 235 /// <a href="/storage/a/cc/acc7f5856bc60ad3bdbd4dc7027e33f9.png" target="_blank" class="img_filename">image.png</a> // can also be https://i.arhivach.vc/... if it's a video 236 /// </div> 237 /// ``` 238 fn parse_post_image_block(pib: scraper::ElementRef) -> File { 239 static SEL_POST_IMAGE_IMG: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new( 240 || scraper::Selector::parse(".post_image img").unwrap() 241 ); 242 static SEL_A_IMG_FILENAME: std::sync::LazyLock<scraper::Selector> = std::sync::LazyLock::new( 243 || scraper::Selector::parse("a.img_filename").unwrap() 244 ); 245 246 // Title example: 247 // 402.2 Кб, 800 x 532 248 // image.png <- name_orig 249 // 17699142349880.png <- name_timestamp 250 let title = pib.value().attr("title").unwrap_or(""); 251 let title_lines: Vec<&str> = title.lines().collect(); 252 let name_orig = title_lines 253 .get(1) 254 .map(|s| s.to_string()) 255 .unwrap_or("unnamed".to_string()); 256 let name_timestamp = title_lines 257 .get(2) 258 .map(|s| s.to_string()) 259 .unwrap_or("unnamed".to_string()); 260 261 // url_thumb 262 let url_thumb = pib 263 .select(&SEL_POST_IMAGE_IMG) 264 .next() 265 .and_then(|el| el.value().attr("src")) 266 .unwrap_or(""); // /storage/t/83c2fe5ba9a8469d9eeef4af124e3b52.thumb 267 let url_thumb = if url_thumb.is_empty() { 268 String::new() 269 } else { 270 format!("{BASE_URL}{url_thumb}") 271 }; 272 273 // url 274 let url = pib 275 .select(&SEL_A_IMG_FILENAME) 276 .next() 277 .and_then(|el| el.value().attr("href")) 278 .unwrap_or(""); 279 let url = if url.starts_with("http") { // is `https://i.arhivach.vc/...`? 280 url.to_string() 281 } else if url.is_empty() { 282 String::new() 283 } else { 284 format!("{BASE_URL}{url}") 285 }; 286 287 File { 288 name_orig, 289 name_timestamp, 290 url_thumb, 291 url, 292 } 293 } 294 295 /// Parses the post text from `div.post_comment_body` 296 /// 297 /// Returns post text: 298 /// - References are plaintext (e.g. >>329274789) 299 /// - `<br>` is replaced with \n 300 /// - `<span class="unkfunc">` (greentext) is replaced with >text 301 /// 302 /// If the text contains a reference (e.g. >>329274789) it looks like this in the element: 303 /// ```html 304 /// <div class="post_comment_body"> 305 /// <a href="#329274893" class="post-reply-link" data-thread="329273515" data-num="329274893">>>329274893</a> // This will be replaced with >>329274893 306 /// <br> 307 /// <span class="unkfunc">>greentext1</span> 308 /// <br> 309 /// text1 310 /// </div> 311 /// ``` 312 /// 313 /// This example returns: 314 /// ```text 315 /// >>329274893 316 /// >greentext1 317 /// text1 318 /// ``` 319 fn parse_post_comment_body(node: scraper::ElementRef) -> String { 320 use scraper::node::Node; 321 322 let mut result = String::new(); 323 for child in node.children() { 324 match child.value() { 325 Node::Text(text) => result.push_str(&text.text), 326 Node::Element(el) if el.name() == "br" => result.push('\n'), 327 Node::Element(_) => { 328 if let Some(el_ref) = scraper::ElementRef::wrap(child) { 329 result.push_str(&el_ref.text().collect::<String>()); 330 } 331 } 332 _ => {} 333 } 334 } 335 result.trim().to_string() 336 } 337 } 338 339 impl std::fmt::Display for Post { 340 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 341 // Header line 342 let name = self.name.as_deref().unwrap_or("Аноним"); 343 let mailto = self.mailto.as_deref().unwrap_or(""); 344 345 if !mailto.is_empty() { 346 write!(f, "{} ({})", name, mailto)?; 347 } else { 348 write!(f, "{}", name)?; 349 } 350 351 write!(f, " {} {} ID:{}", self.time, self.num, self.id)?; 352 353 // Subject 354 if let Some(ref subject) = self.subject { 355 write!(f, "\n{}", subject)?; 356 } 357 358 // Files 359 if !self.files.is_empty() { 360 write!(f, "\n[Files: {}]", self.files.len())?; 361 for file in &self.files { 362 write!(f, "\n - {}", file)?; 363 } 364 } 365 366 // Post text 367 if !self.text.is_empty() { 368 write!(f, "\n{}", self.text)?; 369 } 370 371 Ok(()) 372 } 373 }