feat: make scraper fully reproducible (hopefully)

This commit is contained in:
talwat 2025-08-08 20:57:59 +02:00
parent 1052b5e782
commit 7d486b08f5
2 changed files with 72 additions and 36 deletions

View File

@ -614,6 +614,19 @@ https://stream.chillhop.com/mp3/
9225!Tumbling 9225!Tumbling
9224!Fox 9224!Fox
8037!Cliff Of The Horizon 8037!Cliff Of The Horizon
8261!Drifter
8985!Mozambique
8707!autumn breeze
8080!YesPlease
8391!Chasin Daisys
7922!A Tribe Called Tenz
7840!Sunsets
8827!Changing Lanes
7833!Sunlit
8117!back when it all made sense
9003!jam session
7914!London Love Letters
8405!Chrysalism
9339!Highland 9339!Highland
9237!Waterfall Eyes 9237!Waterfall Eyes
9010!Origin 9010!Origin
@ -752,7 +765,6 @@ https://stream.chillhop.com/mp3/
9783!After All 9783!After All
9284!Anywhere But Here 9284!Anywhere But Here
8072!comfortable 8072!comfortable
7914!London Love Letters
7929!Wrong Way 7929!Wrong Way
7955!Cozy Fire 7955!Cozy Fire
10320!Ocean View 10320!Ocean View
@ -1228,14 +1240,6 @@ https://stream.chillhop.com/mp3/
60840!Suzuki 60840!Suzuki
60842!Mahogany 60842!Mahogany
65379!Let Go (Philanthrope Remix) 65379!Let Go (Philanthrope Remix)
64098!Satin
64097!Setbacks
64096!Loner
64095!Coast
64094!Back Again
64093!No Hassle
64092!Bloom
64091!Roam
64125!Fade out 64125!Fade out
64124!Things Fall Apart Pt.2 64124!Things Fall Apart Pt.2
64123!Serenade 64123!Serenade
@ -1244,6 +1248,14 @@ https://stream.chillhop.com/mp3/
64119!Move Like That 64119!Move Like That
64118!Panda 64118!Panda
64117!Beavis pt.2 64117!Beavis pt.2
64098!Satin
64097!Setbacks
64096!Loner
64095!Coast
64094!Back Again
64093!No Hassle
64092!Bloom
64091!Roam
65489!Inhale/Ad Astra (Boukas Remix) 65489!Inhale/Ad Astra (Boukas Remix)
68327!Naturally Flavored 68327!Naturally Flavored
68324!Forward Movement 68324!Forward Movement
@ -1312,15 +1324,8 @@ https://stream.chillhop.com/mp3/
64054!I Don't Want Love 64054!I Don't Want Love
64040!One for Florian 64040!One for Florian
75541!Seu Trio 75541!Seu Trio
64045!Cut Free
75544!You Bring Me Life 75544!You Bring Me Life
64036!Curtain Call 64045!Cut Free
64038!High Hope
64043!In the Sun
64047!Sleeping Norboo
64050!Autumn Turned Winter
64052!Light of World
64056!Harp Trees
75547!Last One 75547!Last One
75546!That Summer 75546!That Summer
75545!Hold it Down 75545!Hold it Down
@ -1328,6 +1333,13 @@ https://stream.chillhop.com/mp3/
75542!Hope 75542!Hope
75540!When All I Heard Was Artifacts 75540!When All I Heard Was Artifacts
75539!Cantar 75539!Cantar
64036!Curtain Call
64038!High Hope
64043!In the Sun
64047!Sleeping Norboo
64050!Autumn Turned Winter
64052!Light of World
64056!Harp Trees
79272!Early June 79272!Early June
74856!Light of World 74856!Light of World
77527!Guitar Shop 77527!Guitar Shop

View File

@ -1,4 +1,5 @@
use eyre::{bail, eyre}; use eyre::{bail, eyre};
use futures::stream::FuturesUnordered;
use futures::{stream::FuturesOrdered, StreamExt}; use futures::{stream::FuturesOrdered, StreamExt};
use indicatif::ProgressBar; use indicatif::ProgressBar;
use lazy_static::lazy_static; use lazy_static::lazy_static;
@ -45,14 +46,14 @@ impl Track {
} }
} }
#[allow(dead_code)]
#[derive(Deserialize, Debug)] #[derive(Deserialize, Debug)]
struct Release { struct Release {
#[serde(skip)] #[serde(skip)]
pub path: String, pub path: String,
#[serde(skip)] #[serde(skip)]
pub name: String, pub index: usize,
pub tracks: Vec<Track>, pub tracks: Vec<Track>,
} }
@ -68,16 +69,21 @@ enum ReleaseError {
impl Release { impl Release {
pub async fn scan( pub async fn scan(
path: String, path: String,
index: usize,
client: Client, client: Client,
bar: ProgressBar, bar: ProgressBar,
) -> Result<Self, ReleaseError> { ) -> Result<Self, ReleaseError> {
let content = get(&client, &path).await?; let content = get(&client, &path).await?;
let html = Html::parse_document(&content); let html = Html::parse_document(&content);
let author = html let author = html.select(&RELEASE_AUTHOR).next();
.select(&RELEASE_AUTHOR)
.next() if let Some(author) = author {
.ok_or(eyre!("unable to find author: {path}"))?; if author.inner_html() == "Kenji" {
// No lyrics!
return Err(ReleaseError::Ignored);
}
}
let textarea = html let textarea = html
.select(&RELEASE_TEXTAREA) .select(&RELEASE_TEXTAREA)
@ -86,12 +92,9 @@ impl Release {
let mut release: Self = serde_json::from_str(&textarea.inner_html()).unwrap(); let mut release: Self = serde_json::from_str(&textarea.inner_html()).unwrap();
release.path = path; release.path = path;
release.index = index;
release.tracks.reverse(); release.tracks.reverse();
if author.inner_html() == "Kenji" {
return Err(ReleaseError::Ignored);
}
bar.inc(release.tracks.len() as u64); bar.inc(release.tracks.len() as u64);
Ok(release) Ok(release)
@ -156,7 +159,8 @@ async fn scan_page(
let elements = html.select(&RELEASES); let elements = html.select(&RELEASES);
Ok(elements Ok(elements
.filter_map(|x| { .enumerate()
.filter_map(|(i, x)| {
let label = x.select(&RELEASE_LABEL).next()?.inner_html(); let label = x.select(&RELEASE_LABEL).next()?.inner_html();
if label == "Compilation" { if label == "Compilation" {
return None; return None;
@ -164,6 +168,7 @@ async fn scan_page(
Some(Release::scan( Some(Release::scan(
x.attr("href")?.to_string(), x.attr("href")?.to_string(),
(number * 12) + i,
client.clone(), client.clone(),
bar.clone(), bar.clone(),
)) ))
@ -174,26 +179,38 @@ async fn scan_page(
pub async fn scrape() -> eyre::Result<()> { pub async fn scrape() -> eyre::Result<()> {
const PAGE_COUNT: usize = 40; const PAGE_COUNT: usize = 40;
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36"; const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36";
const TRACK_COUNT: u64 = 1500; const TRACK_COUNT: u64 = 1600;
fs::create_dir_all("./cache/chillhop").await.unwrap(); fs::create_dir_all("./cache/chillhop").await.unwrap();
let client = Client::builder().user_agent(USER_AGENT).build().unwrap(); let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
let mut futures = FuturesOrdered::new(); let mut futures = FuturesUnordered::new();
let bar = ProgressBar::new(TRACK_COUNT); let bar = ProgressBar::new(TRACK_COUNT + 12 * (PAGE_COUNT as u64));
let mut errors = Vec::new();
// This is slightly less memory efficient than I'd hope, but it is what it is. // This is slightly less memory efficient than I'd hope, but it is what it is.
for page in 0..=PAGE_COUNT { for page in 0..=PAGE_COUNT {
bar.inc(12);
for x in scan_page(page, &client, bar.clone()).await? { for x in scan_page(page, &client, bar.clone()).await? {
futures.push_front(x); futures.push(x);
} }
} }
let mut printed = Vec::with_capacity(1500); let mut results: Vec<Result<Release, ReleaseError>> = futures.collect().await;
while let Some(result) = futures.next().await { bar.finish_and_clear();
eprintln!("sorting...");
results.sort_by_key(|x| if let Ok(x) = x { x.index } else { 0 });
results.reverse();
eprintln!("printing...");
let mut printed = Vec::with_capacity(TRACK_COUNT as usize);
for result in results {
let release = match result { let release = match result {
Ok(release) => release, Ok(release) => release,
Err(_) => { Err(error) => {
errors.push(error);
continue; continue;
} }
}; };
@ -214,7 +231,14 @@ pub async fn scrape() -> eyre::Result<()> {
} }
} }
bar.finish(); eprintln!("-- ERROR REPORT --");
for error in errors {
if matches!(error, ReleaseError::Ignored) {
continue;
}
eprintln!("{error}");
}
Ok(()) Ok(())
} }