diff --git a/data/chillhop-new.txt b/data/chillhop-new.txt index aaba419..b704b2d 100644 --- a/data/chillhop-new.txt +++ b/data/chillhop-new.txt @@ -614,6 +614,19 @@ https://stream.chillhop.com/mp3/ 9225!Tumbling 9224!Fox 8037!Cliff Of The Horizon +8261!Drifter +8985!Mozambique +8707!autumn breeze +8080!YesPlease +8391!Chasin Daisys +7922!A Tribe Called Tenz +7840!Sunsets +8827!Changing Lanes +7833!Sunlit +8117!back when it all made sense +9003!jam session +7914!London Love Letters +8405!Chrysalism 9339!Highland 9237!Waterfall Eyes 9010!Origin @@ -752,7 +765,6 @@ https://stream.chillhop.com/mp3/ 9783!After All 9284!Anywhere But Here 8072!comfortable -7914!London Love Letters 7929!Wrong Way 7955!Cozy Fire 10320!Ocean View @@ -1228,14 +1240,6 @@ https://stream.chillhop.com/mp3/ 60840!Suzuki 60842!Mahogany 65379!Let Go (Philanthrope Remix) -64098!Satin -64097!Setbacks -64096!Loner -64095!Coast -64094!Back Again -64093!No Hassle -64092!Bloom -64091!Roam 64125!Fade out 64124!Things Fall Apart Pt.2 64123!Serenade @@ -1244,6 +1248,14 @@ https://stream.chillhop.com/mp3/ 64119!Move Like That 64118!Panda 64117!Beavis pt.2 +64098!Satin +64097!Setbacks +64096!Loner +64095!Coast +64094!Back Again +64093!No Hassle +64092!Bloom +64091!Roam 65489!Inhale/Ad Astra (Boukas Remix) 68327!Naturally Flavored 68324!Forward Movement @@ -1312,15 +1324,8 @@ https://stream.chillhop.com/mp3/ 64054!I Don't Want Love 64040!One for Florian 75541!Seu Trio -64045!Cut Free 75544!You Bring Me Life -64036!Curtain Call -64038!High Hope -64043!In the Sun -64047!Sleeping Norboo -64050!Autumn Turned Winter -64052!Light of World -64056!Harp Trees +64045!Cut Free 75547!Last One 75546!That Summer 75545!Hold it Down @@ -1328,6 +1333,13 @@ https://stream.chillhop.com/mp3/ 75542!Hope 75540!When All I Heard Was Artifacts 75539!Cantar +64036!Curtain Call +64038!High Hope +64043!In the Sun +64047!Sleeping Norboo +64050!Autumn Turned Winter +64052!Light of World +64056!Harp Trees 79272!Early June 74856!Light of World 77527!Guitar Shop diff --git a/src/scrapers/chillhop.rs b/src/scrapers/chillhop.rs index f7bee0b..8ba4754 100644 --- a/src/scrapers/chillhop.rs +++ b/src/scrapers/chillhop.rs @@ -1,4 +1,5 @@ use eyre::{bail, eyre}; +use futures::stream::FuturesUnordered; use futures::{stream::FuturesOrdered, StreamExt}; use indicatif::ProgressBar; use lazy_static::lazy_static; @@ -45,14 +46,14 @@ impl Track { } } -#[allow(dead_code)] #[derive(Deserialize, Debug)] struct Release { #[serde(skip)] pub path: String, #[serde(skip)] - pub name: String, + pub index: usize, + pub tracks: Vec, } @@ -68,16 +69,21 @@ enum ReleaseError { impl Release { pub async fn scan( path: String, + index: usize, client: Client, bar: ProgressBar, ) -> Result { let content = get(&client, &path).await?; let html = Html::parse_document(&content); - let author = html - .select(&RELEASE_AUTHOR) - .next() - .ok_or(eyre!("unable to find author: {path}"))?; + let author = html.select(&RELEASE_AUTHOR).next(); + + if let Some(author) = author { + if author.inner_html() == "Kenji" { + // No lyrics! + return Err(ReleaseError::Ignored); + } + } let textarea = html .select(&RELEASE_TEXTAREA) @@ -86,12 +92,9 @@ impl Release { let mut release: Self = serde_json::from_str(&textarea.inner_html()).unwrap(); release.path = path; + release.index = index; release.tracks.reverse(); - if author.inner_html() == "Kenji" { - return Err(ReleaseError::Ignored); - } - bar.inc(release.tracks.len() as u64); Ok(release) @@ -156,7 +159,8 @@ async fn scan_page( let elements = html.select(&RELEASES); Ok(elements - .filter_map(|x| { + .enumerate() + .filter_map(|(i, x)| { let label = x.select(&RELEASE_LABEL).next()?.inner_html(); if label == "Compilation" { return None; @@ -164,6 +168,7 @@ async fn scan_page( Some(Release::scan( x.attr("href")?.to_string(), + (number * 12) + i, client.clone(), bar.clone(), )) @@ -174,26 +179,38 @@ async fn scan_page( pub async fn scrape() -> eyre::Result<()> { const PAGE_COUNT: usize = 40; const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36"; - const TRACK_COUNT: u64 = 1500; + const TRACK_COUNT: u64 = 1600; fs::create_dir_all("./cache/chillhop").await.unwrap(); let client = Client::builder().user_agent(USER_AGENT).build().unwrap(); - let mut futures = FuturesOrdered::new(); - let bar = ProgressBar::new(TRACK_COUNT); + let mut futures = FuturesUnordered::new(); + let bar = ProgressBar::new(TRACK_COUNT + 12 * (PAGE_COUNT as u64)); + + let mut errors = Vec::new(); // This is slightly less memory efficient than I'd hope, but it is what it is. for page in 0..=PAGE_COUNT { + bar.inc(12); for x in scan_page(page, &client, bar.clone()).await? { - futures.push_front(x); + futures.push(x); } } - let mut printed = Vec::with_capacity(1500); - while let Some(result) = futures.next().await { + let mut results: Vec> = futures.collect().await; + bar.finish_and_clear(); + + eprintln!("sorting..."); + results.sort_by_key(|x| if let Ok(x) = x { x.index } else { 0 }); + results.reverse(); + + eprintln!("printing..."); + let mut printed = Vec::with_capacity(TRACK_COUNT as usize); + for result in results { let release = match result { Ok(release) => release, - Err(_) => { + Err(error) => { + errors.push(error); continue; } }; @@ -214,7 +231,14 @@ pub async fn scrape() -> eyre::Result<()> { } } - bar.finish(); + eprintln!("-- ERROR REPORT --"); + for error in errors { + if matches!(error, ReleaseError::Ignored) { + continue; + } + + eprintln!("{error}"); + } Ok(()) }