feat: finish chillhop scraper

This commit is contained in:
Tal 2025-08-07 20:08:49 +02:00
parent 3f55768754
commit 91bb61dd92
6 changed files with 181 additions and 25 deletions

7
.vscode/launch.json vendored Normal file
View File

@ -0,0 +1,7 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": []
}

30
Cargo.lock generated
View File

@ -1016,6 +1016,15 @@ version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
[[package]]
name = "html-escape"
version = "0.2.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d1ad449764d627e22bfd7cd5e8868264fc9236e07c752972b4080cd351cb476"
dependencies = [
"utf8-width",
]
[[package]]
name = "html5ever"
version = "0.29.0"
@ -1410,6 +1419,7 @@ dependencies = [
"dirs",
"eyre",
"futures",
"html-escape",
"lazy_static",
"libc",
"mpris-server",
@ -1417,6 +1427,8 @@ dependencies = [
"reqwest",
"rodio",
"scraper",
"serde",
"serde_json",
"thiserror 2.0.12",
"tokio",
"unicode-segmentation",
@ -2293,18 +2305,18 @@ dependencies = [
[[package]]
name = "serde"
version = "1.0.217"
version = "1.0.219"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70"
checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.217"
version = "1.0.219"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0"
checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00"
dependencies = [
"proc-macro2",
"quote",
@ -2313,9 +2325,9 @@ dependencies = [
[[package]]
name = "serde_json"
version = "1.0.135"
version = "1.0.142"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b0d7ba2887406110130a978386c4e1befb98c674b4fba677954e4db976630d9"
checksum = "030fedb782600dcbd6f02d479bf0d817ac3bb40d644745b769d6a96bc3afc5a7"
dependencies = [
"itoa",
"memchr",
@ -2985,6 +2997,12 @@ version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246"
[[package]]
name = "utf8-width"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "86bd8d4e895da8537e5315b8254664e6b769c4ff3db18321b297a1e7004392e3"
[[package]]
name = "utf8_iter"
version = "1.0.4"

View File

@ -54,3 +54,6 @@ lazy_static = "1.5.0"
libc = "0.2.167"
url = "2.5.4"
unicode-segmentation = "1.12.0"
serde = { version = "1.0.219", features = ["derive"] }
serde_json = "1.0.142"
html-escape = "0.2.13"

4
scripts/fix_cache.sh Normal file
View File

@ -0,0 +1,4 @@
#!/bin/sh
# Purge bad entries from the scraper's HTTP cache.
# Step 1: delete every cached file whose body recorded a rate-limit response,
# so those pages are re-fetched on the next scraper run.
grep -rlZ "429 Too Many Requests" . | xargs -0 rm -f
# Step 2: delete any empty files (e.g. aborted/failed downloads).
find . -type f -empty -delete

View File

@ -104,7 +104,7 @@ async fn main() -> eyre::Result<()> {
include_full,
} => match source {
Sources::Lofigirl => scrapers::lofigirl::scrape(extension, include_full).await?,
Sources::Chillhop => scrapers::chillhop::scrape().await,
Sources::Chillhop => scrapers::chillhop::scrape().await?,
},
}
} else {

View File

@ -1,53 +1,177 @@
use eyre::{bail, eyre};
use futures::{stream::FuturesOrdered, StreamExt};
use lazy_static::lazy_static;
use std::path::{Path, PathBuf};
use reqwest::Client;
use scraper::{Html, Selector};
use serde::Deserialize;
use tokio::{
fs::{self, File},
io::AsyncWriteExt,
};
struct Release {
pub tracks: Vec<String>,
pub author: String,
lazy_static! {
    // CSS selectors compiled once and shared by all scraping functions.
    // Selector::parse only fails on invalid CSS, so these unwraps are safe.
    // Anchor element of each release row in the paginated releases table.
    static ref RELEASES: Selector = Selector::parse(".table-body > a").unwrap();
    // Type label ("Compilation", "Mix", ...) inside a release row.
    static ref RELEASE_LABEL: Selector = Selector::parse("label").unwrap();
    // static ref RELEASE_DATE: Selector = Selector::parse(".release-feat-props > .text-xs").unwrap();
    // static ref RELEASE_NAME: Selector = Selector::parse(".release-feat-props > h2").unwrap();
    // Artist link on a release detail page.
    static ref RELEASE_AUTHOR: Selector = Selector::parse(".release-feat-props .artist-link").unwrap();
    // <textarea> on a release page that carries the embedded JSON payload.
    static ref RELEASE_TEXTAREA: Selector = Selector::parse("textarea").unwrap();
}
struct Data {
pub releases: Vec<Release>,
/// One track as it appears in the JSON payload embedded in a release page.
/// The payload uses camelCase keys (e.g. `fileId`), hence the rename rule.
#[derive(Deserialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct Track {
    // Track title; may contain HTML entities, decoded when printed.
    title: String,
    // Opaque track identifier emitted verbatim in the output line.
    file_id: String,
    // Comma-separated artist list; the first entry is used as the primary artist.
    artists: String,
}
/// A release (album/EP) deserialized from the JSON blob embedded in its page.
#[derive(Deserialize, Debug)]
struct Release {
    // Not part of the JSON payload; defaults to "".
    // NOTE(review): never assigned in the visible code — confirm it is still needed.
    #[serde(skip)]
    pub path: String,
    // Not part of the JSON payload; defaults to "".
    // NOTE(review): never assigned in the visible code — confirm it is still needed.
    #[serde(skip)]
    pub name: String,
    // Track list as found in the payload (reversed after parsing in `scan`).
    pub tracks: Vec<Track>,
}
/// Reason a release could not be produced by `Release::scan`.
#[derive(thiserror::Error, Debug)]
enum ReleaseError {
    /// Fetch or parse failure, wrapping the underlying eyre error.
    #[error("invalid track: {0}")]
    Invalid(#[from] eyre::Error),
    /// Release deliberately skipped (see the artist check in `Release::scan`).
    #[error("track explicitly ignored")]
    Ignored,
}
impl Release {
    /// Fetches and parses a single release page.
    ///
    /// Loads the page at `path` (through the `get` cache), extracts the JSON
    /// payload from the page's <textarea>, deserializes it, and returns the
    /// release. Fails with `ReleaseError::Invalid` when the page cannot be
    /// fetched or is missing the expected elements, and with
    /// `ReleaseError::Ignored` for releases by the artist "Kenji".
    pub async fn scan(path: String, client: Client) -> Result<Self, ReleaseError> {
        let content = get(&client, &path).await?;
        let html = Html::parse_document(&content);
        // The release metadata is embedded as JSON inside a <textarea>.
        let textarea = html
            .select(&RELEASE_TEXTAREA)
            .next()
            .ok_or(eyre!("unable to find textarea: {path}"))?;
        // NOTE(review): this unwrap panics on malformed JSON from the site;
        // consider mapping the error into ReleaseError::Invalid instead.
        let mut release: Self = serde_json::from_str(&textarea.inner_html()).unwrap();
        // Reverse the track order — presumably the page lists tracks in the
        // opposite order from the desired output; TODO confirm.
        release.tracks.reverse();
        let author = html
            .select(&RELEASE_AUTHOR)
            .next()
            .ok_or(eyre!("unable to find author: {path}"))?;
        if author.inner_html() == "Kenji" {
            return Err(ReleaseError::Ignored);
        }
        Ok(release)
    }
}
/// Sends a get request, with caching.
async fn get(client: &Client, path: &str) -> String {
let cache = PathBuf::from(format!("./cache/chillhop/{path}.html"));
async fn get(client: &Client, path: &str) -> eyre::Result<String> {
let trimmed = path.trim_matches('/');
let cache = PathBuf::from(format!("./cache/chillhop/{trimmed}.html"));
if let Ok(x) = fs::read_to_string(&cache).await {
x
Ok(x)
} else {
let resp = client
.get(format!("https://chillhop.com/{path}"))
.get(format!("https://chillhop.com/{trimmed}"))
.send()
.await
.unwrap();
let text = resp.text().await.unwrap();
.await?;
let status = resp.status();
if status == 429 {
bail!("rate limit reached: {path}");
}
if status != 404 && !status.is_success() && !status.is_redirection() {
bail!("non success code {}: {path}", resp.status().as_u16());
}
let text = resp.text().await?;
let parent = cache.parent();
if let Some(x) = parent {
if x != Path::new("") {
fs::create_dir_all(x).await.unwrap();
fs::create_dir_all(x).await?;
}
}
let mut file = File::create(&cache).await.unwrap();
file.write_all(text.as_bytes()).await.unwrap();
let mut file = File::create(&cache).await?;
file.write_all(text.as_bytes()).await?;
text
if status.is_redirection() {
bail!("redirect: {path}")
}
if status == 404 {
bail!("not found: {path}")
}
Ok(text)
}
}
pub async fn scrape() {
/// Collects release-scan futures for one page of the paginated releases list.
///
/// Fetches page `number` of `releases/`, finds every release row, skips rows
/// labelled "Compilation" or "Mix", and returns one not-yet-awaited
/// `Release::scan` future per remaining release so the caller controls
/// concurrency.
async fn scan_page(
    number: usize,
    client: &Client,
) -> eyre::Result<Vec<impl futures::Future<Output = Result<Release, ReleaseError>>>> {
    let path = format!("releases/?page={number}");
    let content = get(client, &path).await?;
    let html = Html::parse_document(&content);
    let elements = html.select(&RELEASES);
    Ok(elements
        .filter_map(|x| {
            // The <label> inside each row carries the release type; rows
            // without a label or href are silently skipped.
            let label = x.select(&RELEASE_LABEL).next()?.inner_html();
            if label == "Compilation" || label == "Mix" {
                return None;
            }
            Some(Release::scan(x.attr("href")?.to_string(), client.clone()))
        })
        .collect())
}
/// Scrapes all chillhop release pages and prints one
/// `<fileId>!<artist> - <title>` line per track to stdout.
/// Failed releases are reported to stderr and skipped.
pub async fn scrape() -> eyre::Result<()> {
    // Number of pages in the releases listing — TODO confirm this stays in
    // sync with the site.
    const PAGE_COUNT: usize = 40;
    // Browser-like user agent; presumably required to avoid being blocked —
    // TODO confirm.
    const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36";
    fs::create_dir_all("./cache/chillhop").await.unwrap();
    let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
    // NOTE(review): result ignored — `get` returns a must-use Result, so this
    // produces an unused-result warning; looks like a leftover warm-up call,
    // confirm whether it is still needed.
    get(&client, "releases/?page=30").await;
    let mut futures = FuturesOrdered::new();
    // This is slightly less memory efficient than I'd hope, but it is what it is.
    for page in 0..=PAGE_COUNT {
        for x in scan_page(page, &client).await? {
            futures.push_front(x);
        }
    }
    while let Some(result) = futures.next().await {
        let release = match result {
            Ok(release) => release,
            Err(error) => {
                // Keep scraping the remaining releases on failure.
                eprintln!("error: {}, skipping", error);
                continue;
            }
        };
        for track in release.tracks {
            // Titles/artists may contain HTML entities; decode before printing.
            let title = html_escape::decode_html_entities(&track.title);
            // Keep only the first listed artist.
            let artist = html_escape::decode_html_entities(
                track.artists.split(", ").next().unwrap_or(&track.artists),
            );
            println!("{}!{artist} - {title}", track.file_id)
        }
    }
    Ok(())
}