diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..5c7247b --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,7 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [] +} \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index fd93975..0c3baf0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1016,6 +1016,15 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" +[[package]] +name = "html-escape" +version = "0.2.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d1ad449764d627e22bfd7cd5e8868264fc9236e07c752972b4080cd351cb476" +dependencies = [ + "utf8-width", +] + [[package]] name = "html5ever" version = "0.29.0" @@ -1410,6 +1419,7 @@ dependencies = [ "dirs", "eyre", "futures", + "html-escape", "lazy_static", "libc", "mpris-server", @@ -1417,6 +1427,8 @@ dependencies = [ "reqwest", "rodio", "scraper", + "serde", + "serde_json", "thiserror 2.0.12", "tokio", "unicode-segmentation", @@ -2293,18 +2305,18 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.217" +version = "1.0.219" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70" +checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.217" +version = "1.0.219" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" +checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" dependencies = [ "proc-macro2", "quote", @@ -2313,9 +2325,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.135" +version = "1.0.142" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b0d7ba2887406110130a978386c4e1befb98c674b4fba677954e4db976630d9" +checksum = "030fedb782600dcbd6f02d479bf0d817ac3bb40d644745b769d6a96bc3afc5a7" dependencies = [ "itoa", "memchr", @@ -2985,6 +2997,12 @@ version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246" +[[package]] +name = "utf8-width" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86bd8d4e895da8537e5315b8254664e6b769c4ff3db18321b297a1e7004392e3" + [[package]] name = "utf8_iter" version = "1.0.4" diff --git a/Cargo.toml b/Cargo.toml index 2220be4..cf898dc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -54,3 +54,6 @@ lazy_static = "1.5.0" libc = "0.2.167" url = "2.5.4" unicode-segmentation = "1.12.0" +serde = { version = "1.0.219", features = ["derive"] } +serde_json = "1.0.142" +html-escape = "0.2.13" diff --git a/scripts/fix_cache.sh b/scripts/fix_cache.sh new file mode 100644 index 0000000..7f3bca6 --- /dev/null +++ b/scripts/fix_cache.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +grep -rlZ "429 Too Many Requests" . | xargs -0 rm -f +find . -type f -empty -delete \ No newline at end of file diff --git a/src/main.rs b/src/main.rs index 5af5687..fff875f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -104,7 +104,7 @@ async fn main() -> eyre::Result<()> { include_full, } => match source { Sources::Lofigirl => scrapers::lofigirl::scrape(extension, include_full).await?, - Sources::Chillhop => scrapers::chillhop::scrape().await, + Sources::Chillhop => scrapers::chillhop::scrape().await?, }, } } else { diff --git a/src/scrapers/chillhop.rs b/src/scrapers/chillhop.rs index 945674d..ae81602 100644 --- a/src/scrapers/chillhop.rs +++ b/src/scrapers/chillhop.rs @@ -1,53 +1,177 @@ +use eyre::{bail, eyre}; +use futures::{stream::FuturesOrdered, StreamExt}; +use lazy_static::lazy_static; use std::path::{Path, PathBuf}; use reqwest::Client; +use scraper::{Html, Selector}; +use serde::Deserialize; use tokio::{ fs::{self, File}, io::AsyncWriteExt, }; -struct Release { - pub tracks: Vec, - pub author: String, +lazy_static! { + static ref RELEASES: Selector = Selector::parse(".table-body > a").unwrap(); + static ref RELEASE_LABEL: Selector = Selector::parse("label").unwrap(); + // static ref RELEASE_DATE: Selector = Selector::parse(".release-feat-props > .text-xs").unwrap(); + // static ref RELEASE_NAME: Selector = Selector::parse(".release-feat-props > h2").unwrap(); + static ref RELEASE_AUTHOR: Selector = Selector::parse(".release-feat-props .artist-link").unwrap(); + static ref RELEASE_TEXTAREA: Selector = Selector::parse("textarea").unwrap(); } -struct Data { - pub releases: Vec, +#[derive(Deserialize, Debug)] +#[serde(rename_all = "camelCase")] +pub struct Track { + title: String, + file_id: String, + artists: String, +} + +#[derive(Deserialize, Debug)] +struct Release { + #[serde(skip)] + pub path: String, + #[serde(skip)] + pub name: String, + pub tracks: Vec, +} + +#[derive(thiserror::Error, Debug)] +enum ReleaseError { + #[error("invalid track: {0}")] + Invalid(#[from] eyre::Error), + + #[error("track explicitly ignored")] + Ignored, +} + +impl Release { + pub async fn scan(path: String, client: Client) -> Result { + let content = get(&client, &path).await?; + let html = Html::parse_document(&content); + + let textarea = html + .select(&RELEASE_TEXTAREA) + .next() + .ok_or(eyre!("unable to find textarea: {path}"))?; + let mut release: Self = serde_json::from_str(&textarea.inner_html()).unwrap(); + release.tracks.reverse(); + + let author = html + .select(&RELEASE_AUTHOR) + .next() + .ok_or(eyre!("unable to find author: {path}"))?; + if author.inner_html() == "Kenji" { + return Err(ReleaseError::Ignored); + } + + Ok(release) + } } /// Sends a get request, with caching. -async fn get(client: &Client, path: &str) -> String { - let cache = PathBuf::from(format!("./cache/chillhop/{path}.html")); +async fn get(client: &Client, path: &str) -> eyre::Result { + let trimmed = path.trim_matches('/'); + let cache = PathBuf::from(format!("./cache/chillhop/{trimmed}.html")); + if let Ok(x) = fs::read_to_string(&cache).await { - x + Ok(x) } else { let resp = client - .get(format!("https://chillhop.com/{path}")) + .get(format!("https://chillhop.com/{trimmed}")) .send() - .await - .unwrap(); - let text = resp.text().await.unwrap(); + .await?; + + let status = resp.status(); + + if status == 429 { + bail!("rate limit reached: {path}"); + } + + if status != 404 && !status.is_success() && !status.is_redirection() { + bail!("non success code {}: {path}", resp.status().as_u16()); + } + + let text = resp.text().await?; let parent = cache.parent(); if let Some(x) = parent { if x != Path::new("") { - fs::create_dir_all(x).await.unwrap(); + fs::create_dir_all(x).await?; } } - let mut file = File::create(&cache).await.unwrap(); - file.write_all(text.as_bytes()).await.unwrap(); + let mut file = File::create(&cache).await?; + file.write_all(text.as_bytes()).await?; - text + if status.is_redirection() { + bail!("redirect: {path}") + } + + if status == 404 { + bail!("not found: {path}") + } + + Ok(text) } } -pub async fn scrape() { +async fn scan_page( + number: usize, + client: &Client, +) -> eyre::Result>>> { + let path = format!("releases/?page={number}"); + let content = get(client, &path).await?; + let html = Html::parse_document(&content); + + let elements = html.select(&RELEASES); + Ok(elements + .filter_map(|x| { + let label = x.select(&RELEASE_LABEL).next()?.inner_html(); + if label == "Compilation" || label == "Mix" { + return None; + } + + Some(Release::scan(x.attr("href")?.to_string(), client.clone())) + }) + .collect()) +} + +pub async fn scrape() -> eyre::Result<()> { const PAGE_COUNT: usize = 40; const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36"; fs::create_dir_all("./cache/chillhop").await.unwrap(); let client = Client::builder().user_agent(USER_AGENT).build().unwrap(); - get(&client, "releases/?page=30").await; + let mut futures = FuturesOrdered::new(); + + // This is slightly less memory efficient than I'd hope, but it is what it is. + for page in 0..=PAGE_COUNT { + for x in scan_page(page, &client).await? { + futures.push_front(x); + } + } + + while let Some(result) = futures.next().await { + let release = match result { + Ok(release) => release, + Err(error) => { + eprintln!("error: {}, skipping", error); + continue; + } + }; + + for track in release.tracks { + let title = html_escape::decode_html_entities(&track.title); + let artist = html_escape::decode_html_entities( + track.artists.split(", ").next().unwrap_or(&track.artists), + ); + + println!("{}!{artist} - {title}", track.file_id) + } + } + + Ok(()) }