From fa0d1f52c3faa38a0a299de7d625c6c17a57a770 Mon Sep 17 00:00:00 2001
From: talwat <83217276+talwat@users.noreply.github.com>
Date: Sat, 9 Aug 2025 17:42:04 +0200
Subject: [PATCH] fix: separate get function from chillhop scraper

---
 data/chillhop-new.txt    |  3 +-
 src/main.rs              |  8 ++---
 src/scrapers.rs          | 73 +++++++++++++++++++++++++++++++++++++++-
 src/scrapers/chillhop.rs | 65 ++++------------------------------
 src/scrapers/lofigirl.rs | 20 ++++++-----
 5 files changed, 95 insertions(+), 74 deletions(-)

diff --git a/data/chillhop-new.txt b/data/chillhop-new.txt
index 547e569..bb47a2d 100644
--- a/data/chillhop-new.txt
+++ b/data/chillhop-new.txt
@@ -1,4 +1,3 @@
-https://stream.chillhop.com/mp3/
 9476!Apple Juice
 8448!Tôzen
 8878!Swiss
@@ -1372,4 +1371,4 @@ https://stream.chillhop.com/mp3/
 82333!Driving
 85214!My Steeze
 82331!Filaments
-85273!Shoegaze
\ No newline at end of file
+85273!Shoegaze
diff --git a/src/main.rs b/src/main.rs
index b8de708..f63eb95 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -15,7 +15,7 @@ mod tracks;
 mod scrapers;
 
 #[cfg(feature = "scrape")]
-use crate::scrapers::Sources;
+use crate::scrapers::Source;
 /// An extremely simple lofi player.
 #[derive(Parser, Clone)]
 #[command(about, version)]
@@ -70,7 +70,7 @@ enum Commands {
     #[cfg(feature = "scrape")]
     Scrape {
         // The source to scrape from.
-        source: scrapers::Sources,
+        source: scrapers::Source,
 
         /// The file extension to search for, defaults to mp3.
         #[clap(long, short, default_value = "mp3")]
@@ -106,8 +106,8 @@ async fn main() -> eyre::Result<()> {
             extension,
             include_full,
         } => match source {
-            Sources::Lofigirl => scrapers::lofigirl::scrape(extension, include_full).await?,
-            Sources::Chillhop => scrapers::chillhop::scrape().await?,
+            Source::Lofigirl => scrapers::lofigirl::scrape(extension, include_full).await?,
+            Source::Chillhop => scrapers::chillhop::scrape().await?,
         },
     }
 } else {
diff --git a/src/scrapers.rs b/src/scrapers.rs
index 95707e3..c26e8f3 100644
--- a/src/scrapers.rs
+++ b/src/scrapers.rs
@@ -1,10 +1,81 @@
+use std::path::{Path, PathBuf};
+
 use clap::ValueEnum;
+use eyre::bail;
+use reqwest::Client;
+use tokio::{
+    fs::{self, File},
+    io::AsyncWriteExt,
+};
 
 pub mod chillhop;
 pub mod lofigirl;
 
 #[derive(Clone, Copy, PartialEq, Eq, Debug, ValueEnum)]
-pub enum Sources {
+pub enum Source {
     Lofigirl,
     Chillhop,
 }
+
+impl Source {
+    pub fn cache_dir(&self) -> &'static str {
+        match self {
+            Source::Lofigirl => "lofigirl",
+            Source::Chillhop => "chillhop",
+        }
+    }
+
+    pub fn url(&self) -> &'static str {
+        match self {
+            Source::Chillhop => "https://chillhop.com",
+            Source::Lofigirl => "https://lofigirl.com/wp-content/uploads",
+        }
+    }
+}
+
+/// Sends a GET request, with caching.
+async fn get(client: &Client, path: &str, source: Source) -> eyre::Result<String> {
+    let trimmed = path.trim_matches('/');
+    let cache = PathBuf::from(format!("./cache/{}/{trimmed}.html", source.cache_dir()));
+
+    if let Ok(x) = fs::read_to_string(&cache).await {
+        Ok(x)
+    } else {
+        let resp = client
+            .get(format!("{}/{trimmed}", source.url()))
+            .send()
+            .await?;
+
+        let status = resp.status();
+
+        if status == 429 {
+            bail!("rate limit reached: {path}");
+        }
+
+        if status != 404 && !status.is_success() && !status.is_redirection() {
+            bail!("non success code {}: {path}", resp.status().as_u16());
+        }
+
+        let text = resp.text().await?;
+
+        let parent = cache.parent();
+        if let Some(x) = parent {
+            if x != Path::new("") {
+                fs::create_dir_all(x).await?;
+            }
+        }
+
+        let mut file = File::create(&cache).await?;
+        file.write_all(text.as_bytes()).await?;
+
+        if status.is_redirection() {
+            bail!("redirect: {path}")
+        }
+
+        if status == 404 {
+            bail!("not found: {path}")
+        }
+
+        Ok(text)
+    }
+}
diff --git a/src/scrapers/chillhop.rs b/src/scrapers/chillhop.rs
index b86529d..3fd9149 100644
--- a/src/scrapers/chillhop.rs
+++ b/src/scrapers/chillhop.rs
@@ -1,13 +1,10 @@
-use eyre::{bail, eyre};
+use eyre::eyre;
 use futures::stream::FuturesUnordered;
 use futures::StreamExt;
 use indicatif::ProgressBar;
 use lazy_static::lazy_static;
+use std::fmt;
 use std::str::FromStr;
-use std::{
-    fmt,
-    path::{Path, PathBuf},
-};
 
 use reqwest::Client;
 use scraper::{Html, Selector};
@@ -15,10 +12,9 @@ use serde::{
     de::{self, Visitor},
     Deserialize, Deserializer,
 };
-use tokio::{
-    fs::{self, File},
-    io::AsyncWriteExt,
-};
+use tokio::fs;
+
+use crate::scrapers::{get, Source};
 
 lazy_static! {
     static ref RELEASES: Selector = Selector::parse(".table-body > a").unwrap();
@@ -70,7 +66,7 @@ impl Release {
         client: Client,
         bar: ProgressBar,
     ) -> Result<Self, eyre::Error> {
-        let content = get(&client, &path).await?;
+        let content = get(&client, &path, Source::Chillhop).await?;
 
         let html = Html::parse_document(&content);
         let textarea = html
@@ -89,60 +85,13 @@ impl Release {
     }
 }
 
-/// Sends a get request, with caching.
-async fn get(client: &Client, path: &str) -> eyre::Result<String> {
-    let trimmed = path.trim_matches('/');
-    let cache = PathBuf::from(format!("./cache/chillhop/{trimmed}.html"));
-
-    if let Ok(x) = fs::read_to_string(&cache).await {
-        Ok(x)
-    } else {
-        let resp = client
-            .get(format!("https://chillhop.com/{trimmed}"))
-            .send()
-            .await?;
-
-        let status = resp.status();
-
-        if status == 429 {
-            bail!("rate limit reached: {path}");
-        }
-
-        if status != 404 && !status.is_success() && !status.is_redirection() {
-            bail!("non success code {}: {path}", resp.status().as_u16());
-        }
-
-        let text = resp.text().await?;
-
-        let parent = cache.parent();
-        if let Some(x) = parent {
-            if x != Path::new("") {
-                fs::create_dir_all(x).await?;
-            }
-        }
-
-        let mut file = File::create(&cache).await?;
-        file.write_all(text.as_bytes()).await?;
-
-        if status.is_redirection() {
-            bail!("redirect: {path}")
-        }
-
-        if status == 404 {
-            bail!("not found: {path}")
-        }
-
-        Ok(text)
-    }
-}
-
 async fn scan_page(
     number: usize,
     client: &Client,
     bar: ProgressBar,
 ) -> eyre::Result<Vec<impl futures::Future<Output = Result<Release, eyre::Error>>>> {
     let path = format!("releases/?page={number}");
-    let content = get(client, &path).await?;
+    let content = get(client, &path, Source::Chillhop).await?;
 
     let html = Html::parse_document(&content);
     let elements = html.select(&RELEASES);
diff --git a/src/scrapers/lofigirl.rs b/src/scrapers/lofigirl.rs
index 363efe0..9d0f5a0 100644
--- a/src/scrapers/lofigirl.rs
+++ b/src/scrapers/lofigirl.rs
@@ -5,19 +5,19 @@
 use futures::{stream::FuturesOrdered, StreamExt};
 use lazy_static::lazy_static;
+use reqwest::Client;
 use scraper::{Html, Selector};
 
-const BASE_URL: &str = "https://lofigirl.com/wp-content/uploads/";
+use crate::scrapers::{get, Source};
 
 lazy_static! {
     static ref SELECTOR: Selector = Selector::parse("html > body > pre > a").unwrap();
 }
 
-async fn parse(path: &str) -> eyre::Result<Vec<String>> {
-    let response = reqwest::get(format!("{}{}", BASE_URL, path)).await?;
-    let document = response.text().await?;
-
+async fn parse(client: &Client, path: &str) -> eyre::Result<Vec<String>> {
+    let document = get(client, path, super::Source::Lofigirl).await?;
     let html = Html::parse_document(&document);
+
     Ok(html
         .select(&SELECTOR)
         .skip(5)
@@ -30,9 +30,10 @@
 /// It's a bit hacky, and basically works by checking all of the years, then months, and then all of the files.
 /// This is done as a way to avoid recursion, since async rust really hates recursive functions.
 async fn scan(extension: &str, include_full: bool) -> eyre::Result<Vec<String>> {
+    let client = Client::new();
     let extension = &format!(".{}", extension);
 
-    let items = parse("").await?;
+    let items = parse(&client, "/").await?;
 
     let mut years: Vec<u32> = items
         .iter()
@@ -48,19 +49,20 @@ async fn scan(extension: &str, include_full: bool) -> eyre::Result<Vec<String>>
     let mut futures = FuturesOrdered::new();
 
     for year in years {
-        let months = parse(&year.to_string()).await?;
+        let months = parse(&client, &year.to_string()).await?;
 
         for month in months {
+            let client = client.clone();
             futures.push_back(async move {
                 let path = format!("{}/{}", year, month);
-                let items = parse(&client, &path).await.unwrap();
+                let items = parse(&client, &path).await.unwrap();
 
                 items
                     .into_iter()
                     .filter_map(|x| {
                         if x.ends_with(extension) {
                             if include_full {
-                                Some(format!("{BASE_URL}{path}{x}"))
+                                Some(format!("{}/{path}{x}", Source::Lofigirl.url()))
                             } else {
                                 Some(format!("{path}{x}"))
                             }
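
For reference, a minimal sketch (not part of the patch) of how the shared helper is
called after this change. The `example` wrapper and the literal paths are illustrative
assumptions; `get` and `Source` are the items introduced in src/scrapers.rs above.

    use reqwest::Client;

    async fn example() -> eyre::Result<()> {
        let client = Client::new();

        // Chillhop: fetches https://chillhop.com/releases/?page=1 and caches the
        // body at ./cache/chillhop/releases/?page=1.html; a later call with the
        // same path reads the cached file instead of hitting the network.
        let _page = get(&client, "releases/?page=1", Source::Chillhop).await?;

        // Lofigirl: the Source picks both the base URL and the cache
        // subdirectory, so this resolves to
        // https://lofigirl.com/wp-content/uploads/2023 and ./cache/lofigirl/2023.html.
        let _listing = get(&client, "2023/", Source::Lofigirl).await?;

        Ok(())
    }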