fix: separate get function from chillhop scraper

talwat 2025-08-09 17:42:04 +02:00
parent 4efd8e9e34
commit fa0d1f52c3
5 changed files with 95 additions and 74 deletions
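
In short: the cached `get` helper moves out of the Chillhop scraper into the shared `scrapers` module, the `Sources` enum is renamed to `Source`, and the enum gains `cache_dir()` and `url()` accessors so both scrapers resolve their base URL and on-disk cache through one code path. A minimal sketch of the resulting call shape (the page path is illustrative, and `get` is module-private, so this only compiles inside `scrapers`):

    use reqwest::Client;

    use crate::scrapers::{get, Source};

    // Illustrative only: both scrapers now fetch through the shared, cached
    // helper; the Source argument selects the base URL and cache directory.
    async fn example() -> eyre::Result<String> {
        let client = Client::new();
        get(&client, "releases/?page=1", Source::Chillhop).await
    }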

View File

@@ -1,4 +1,3 @@
-https://stream.chillhop.com/mp3/
 9476!Apple Juice
 8448!Tôzen
 8878!Swiss
@@ -1372,4 +1371,4 @@ https://stream.chillhop.com/mp3/
 82333!Driving
 85214!My Steeze
 82331!Filaments
-85273!Shoegaze
\ No newline at end of file
+85273!Shoegaze

View File

@@ -15,7 +15,7 @@ mod tracks;
 mod scrapers;
 
 #[cfg(feature = "scrape")]
-use crate::scrapers::Sources;
+use crate::scrapers::Source;
 /// An extremely simple lofi player.
 #[derive(Parser, Clone)]
 #[command(about, version)]
@@ -70,7 +70,7 @@
     #[cfg(feature = "scrape")]
     Scrape {
         // The source to scrape from.
-        source: scrapers::Sources,
+        source: scrapers::Source,
 
         /// The file extension to search for, defaults to mp3.
        #[clap(long, short, default_value = "mp3")]
@@ -106,8 +106,8 @@ async fn main() -> eyre::Result<()> {
                 extension,
                 include_full,
             } => match source {
-                Sources::Lofigirl => scrapers::lofigirl::scrape(extension, include_full).await?,
-                Sources::Chillhop => scrapers::chillhop::scrape().await?,
+                Source::Lofigirl => scrapers::lofigirl::scrape(extension, include_full).await?,
+                Source::Chillhop => scrapers::chillhop::scrape().await?,
             },
         }
     } else {

View File

@@ -1,10 +1,81 @@
+use std::path::{Path, PathBuf};
+
 use clap::ValueEnum;
+use eyre::bail;
+use reqwest::Client;
+use tokio::{
+    fs::{self, File},
+    io::AsyncWriteExt,
+};
 
 pub mod chillhop;
 pub mod lofigirl;
 
 #[derive(Clone, Copy, PartialEq, Eq, Debug, ValueEnum)]
-pub enum Sources {
+pub enum Source {
     Lofigirl,
     Chillhop,
 }
+
+impl Source {
+    pub fn cache_dir(&self) -> &'static str {
+        match self {
+            Source::Lofigirl => "lofigirl",
+            Source::Chillhop => "chillhop",
+        }
+    }
+
+    pub fn url(&self) -> &'static str {
+        match self {
+            Source::Chillhop => "https://chillhop.com",
+            Source::Lofigirl => "https://lofigirl.com/wp-content/uploads",
+        }
+    }
+}
+
+/// Sends a get request, with caching.
+async fn get(client: &Client, path: &str, source: Source) -> eyre::Result<String> {
+    let trimmed = path.trim_matches('/');
+
+    let cache = PathBuf::from(format!("./cache/{}/{trimmed}.html", source.cache_dir()));
+
+    if let Ok(x) = fs::read_to_string(&cache).await {
+        Ok(x)
+    } else {
+        let resp = client
+            .get(format!("{}/{trimmed}", source.url()))
+            .send()
+            .await?;
+
+        let status = resp.status();
+        if status == 429 {
+            bail!("rate limit reached: {path}");
+        }
+
+        if status != 404 && !status.is_success() && !status.is_redirection() {
+            bail!("non success code {}: {path}", resp.status().as_u16());
+        }
+
+        let text = resp.text().await?;
+
+        let parent = cache.parent();
+        if let Some(x) = parent {
+            if x != Path::new("") {
+                fs::create_dir_all(x).await?;
+            }
+        }
+
+        let mut file = File::create(&cache).await?;
+        file.write_all(text.as_bytes()).await?;
+
+        if status.is_redirection() {
+            bail!("redirect: {path}")
+        }
+
+        if status == 404 {
+            bail!("not found: {path}")
+        }
+
+        Ok(text)
+    }
+}
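
One detail of the extracted helper worth noting: the response body is written to the cache before the redirect and 404 checks run, so even error pages land on disk, and a rerun returns the cached copy without re-requesting. The cache path is derived from the slash-trimmed request path; a hypothetical unit test (not part of the commit) showing that shape:

    #[test]
    fn cache_path_shape() {
        // Mirrors the helper's derivation: trim slashes, then nest the path
        // under the per-source cache directory. The input path is illustrative.
        let trimmed = "/releases/?page=1/".trim_matches('/');
        let cache = format!("./cache/{}/{trimmed}.html", Source::Chillhop.cache_dir());
        assert_eq!(cache, "./cache/chillhop/releases/?page=1.html");
    }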

View File

@@ -1,13 +1,10 @@
-use eyre::{bail, eyre};
+use eyre::eyre;
 use futures::stream::FuturesUnordered;
 use futures::StreamExt;
 use indicatif::ProgressBar;
 use lazy_static::lazy_static;
+use std::fmt;
 use std::str::FromStr;
-use std::{
-    fmt,
-    path::{Path, PathBuf},
-};
 
 use reqwest::Client;
 use scraper::{Html, Selector};
@@ -15,10 +12,9 @@ use serde::{
     de::{self, Visitor},
     Deserialize, Deserializer,
 };
-use tokio::{
-    fs::{self, File},
-    io::AsyncWriteExt,
-};
+use tokio::fs;
+
+use crate::scrapers::{get, Source};
 
 lazy_static! {
     static ref RELEASES: Selector = Selector::parse(".table-body > a").unwrap();
@@ -70,7 +66,7 @@ impl Release {
         client: Client,
         bar: ProgressBar,
     ) -> Result<Self, ReleaseError> {
-        let content = get(&client, &path).await?;
+        let content = get(&client, &path, Source::Chillhop).await?;
         let html = Html::parse_document(&content);
 
         let textarea = html
@@ -89,60 +85,13 @@
     }
 }
 
-/// Sends a get request, with caching.
-async fn get(client: &Client, path: &str) -> eyre::Result<String> {
-    let trimmed = path.trim_matches('/');
-    let cache = PathBuf::from(format!("./cache/chillhop/{trimmed}.html"));
-
-    if let Ok(x) = fs::read_to_string(&cache).await {
-        Ok(x)
-    } else {
-        let resp = client
-            .get(format!("https://chillhop.com/{trimmed}"))
-            .send()
-            .await?;
-
-        let status = resp.status();
-        if status == 429 {
-            bail!("rate limit reached: {path}");
-        }
-
-        if status != 404 && !status.is_success() && !status.is_redirection() {
-            bail!("non success code {}: {path}", resp.status().as_u16());
-        }
-
-        let text = resp.text().await?;
-
-        let parent = cache.parent();
-        if let Some(x) = parent {
-            if x != Path::new("") {
-                fs::create_dir_all(x).await?;
-            }
-        }
-
-        let mut file = File::create(&cache).await?;
-        file.write_all(text.as_bytes()).await?;
-
-        if status.is_redirection() {
-            bail!("redirect: {path}")
-        }
-
-        if status == 404 {
-            bail!("not found: {path}")
-        }
-
-        Ok(text)
-    }
-}
-
 async fn scan_page(
     number: usize,
     client: &Client,
     bar: ProgressBar,
 ) -> eyre::Result<Vec<impl futures::Future<Output = Result<Release, ReleaseError>>>> {
     let path = format!("releases/?page={number}");
-    let content = get(client, &path).await?;
+    let content = get(client, &path, Source::Chillhop).await?;
     let html = Html::parse_document(&content);
 
     let elements = html.select(&RELEASES);
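
The Lofigirl diff below makes the same swap, and additionally replaces ad-hoc `reqwest::get` calls with one shared `Client` threaded through `parse`. `reqwest::Client` keeps its connection pool behind an `Arc`, so the `client.clone()` handed to each `async move` block is a cheap handle to the same pool rather than a new pool per request. A minimal sketch of that pattern, independent of the scraper code:

    use reqwest::Client;

    // Sketch: one Client up front, cloned per future; the clones share the
    // underlying connection pool, unlike a fresh reqwest::get per call.
    async fn fetch_both(a: &str, b: &str) -> eyre::Result<(String, String)> {
        let client = Client::new();
        let (ca, cb) = (client.clone(), client.clone());
        let fa = async move { ca.get(a).send().await?.text().await };
        let fb = async move { cb.get(b).send().await?.text().await };
        let (ra, rb) = futures::join!(fa, fb);
        Ok((ra?, rb?))
    }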

View File

@@ -5,19 +5,19 @@
 use futures::{stream::FuturesOrdered, StreamExt};
 use lazy_static::lazy_static;
+use reqwest::Client;
 use scraper::{Html, Selector};
 
-const BASE_URL: &str = "https://lofigirl.com/wp-content/uploads/";
+use crate::scrapers::{get, Source};
 
 lazy_static! {
     static ref SELECTOR: Selector = Selector::parse("html > body > pre > a").unwrap();
 }
 
-async fn parse(path: &str) -> eyre::Result<Vec<String>> {
-    let response = reqwest::get(format!("{}{}", BASE_URL, path)).await?;
-    let document = response.text().await?;
+async fn parse(client: &Client, path: &str) -> eyre::Result<Vec<String>> {
+    let document = get(client, path, super::Source::Lofigirl).await?;
 
     let html = Html::parse_document(&document);
 
     Ok(html
         .select(&SELECTOR)
         .skip(5)
@@ -30,9 +30,10 @@ async fn parse(path: &str) -> eyre::Result<Vec<String>> {
 /// It's a bit hacky, and basically works by checking all of the years, then months, and then all of the files.
 /// This is done as a way to avoid recursion, since async rust really hates recursive functions.
 async fn scan(extension: &str, include_full: bool) -> eyre::Result<Vec<String>> {
+    let client = Client::new();
     let extension = &format!(".{}", extension);
-    let items = parse("").await?;
+    let items = parse(&client, "/").await?;
 
     let mut years: Vec<u32> = items
         .iter()
@@ -48,19 +49,20 @@ async fn scan(extension: &str, include_full: bool) -> eyre::Result<Vec<String>>
     let mut futures = FuturesOrdered::new();
     for year in years {
-        let months = parse(&year.to_string()).await?;
+        let months = parse(&client, &year.to_string()).await?;
 
         for month in months {
+            let client = client.clone();
             futures.push_back(async move {
                 let path = format!("{}/{}", year, month);
-                let items = parse(&path).await.unwrap();
+                let items = parse(&client, &path).await.unwrap();
 
                 items
                     .into_iter()
                     .filter_map(|x| {
                         if x.ends_with(extension) {
                             if include_full {
-                                Some(format!("{BASE_URL}{path}{x}"))
+                                Some(format!("{}/{path}{x}", Source::Lofigirl.url()))
                             } else {
                                 Some(format!("{path}{x}"))
                             }