feat: make scraper fully reproducible (hopefully)

This commit is contained in:
talwat 2025-08-08 20:57:59 +02:00
parent 1052b5e782
commit 7d486b08f5
2 changed files with 72 additions and 36 deletions

View File

@ -614,6 +614,19 @@ https://stream.chillhop.com/mp3/
9225!Tumbling
9224!Fox
8037!Cliff Of The Horizon
8261!Drifter
8985!Mozambique
8707!autumn breeze
8080!YesPlease
8391!Chasin Daisys
7922!A Tribe Called Tenz
7840!Sunsets
8827!Changing Lanes
7833!Sunlit
8117!back when it all made sense
9003!jam session
7914!London Love Letters
8405!Chrysalism
9339!Highland
9237!Waterfall Eyes
9010!Origin
@ -752,7 +765,6 @@ https://stream.chillhop.com/mp3/
9783!After All
9284!Anywhere But Here
8072!comfortable
7914!London Love Letters
7929!Wrong Way
7955!Cozy Fire
10320!Ocean View
@ -1228,14 +1240,6 @@ https://stream.chillhop.com/mp3/
60840!Suzuki
60842!Mahogany
65379!Let Go (Philanthrope Remix)
64098!Satin
64097!Setbacks
64096!Loner
64095!Coast
64094!Back Again
64093!No Hassle
64092!Bloom
64091!Roam
64125!Fade out
64124!Things Fall Apart Pt.2
64123!Serenade
@ -1244,6 +1248,14 @@ https://stream.chillhop.com/mp3/
64119!Move Like That
64118!Panda
64117!Beavis pt.2
64098!Satin
64097!Setbacks
64096!Loner
64095!Coast
64094!Back Again
64093!No Hassle
64092!Bloom
64091!Roam
65489!Inhale/Ad Astra (Boukas Remix)
68327!Naturally Flavored
68324!Forward Movement
@ -1312,15 +1324,8 @@ https://stream.chillhop.com/mp3/
64054!I Don't Want Love
64040!One for Florian
75541!Seu Trio
64045!Cut Free
75544!You Bring Me Life
64036!Curtain Call
64038!High Hope
64043!In the Sun
64047!Sleeping Norboo
64050!Autumn Turned Winter
64052!Light of World
64056!Harp Trees
64045!Cut Free
75547!Last One
75546!That Summer
75545!Hold it Down
@ -1328,6 +1333,13 @@ https://stream.chillhop.com/mp3/
75542!Hope
75540!When All I Heard Was Artifacts
75539!Cantar
64036!Curtain Call
64038!High Hope
64043!In the Sun
64047!Sleeping Norboo
64050!Autumn Turned Winter
64052!Light of World
64056!Harp Trees
79272!Early June
74856!Light of World
77527!Guitar Shop

View File

@ -1,4 +1,5 @@
use eyre::{bail, eyre};
use futures::stream::FuturesUnordered;
use futures::{stream::FuturesOrdered, StreamExt};
use indicatif::ProgressBar;
use lazy_static::lazy_static;
@ -45,14 +46,14 @@ impl Track {
}
}
#[allow(dead_code)]
#[derive(Deserialize, Debug)]
struct Release {
#[serde(skip)]
pub path: String,
#[serde(skip)]
pub name: String,
pub index: usize,
pub tracks: Vec<Track>,
}
@ -68,16 +69,21 @@ enum ReleaseError {
impl Release {
pub async fn scan(
path: String,
index: usize,
client: Client,
bar: ProgressBar,
) -> Result<Self, ReleaseError> {
let content = get(&client, &path).await?;
let html = Html::parse_document(&content);
let author = html
.select(&RELEASE_AUTHOR)
.next()
.ok_or(eyre!("unable to find author: {path}"))?;
let author = html.select(&RELEASE_AUTHOR).next();
if let Some(author) = author {
if author.inner_html() == "Kenji" {
// No lyrics!
return Err(ReleaseError::Ignored);
}
}
let textarea = html
.select(&RELEASE_TEXTAREA)
@ -86,12 +92,9 @@ impl Release {
let mut release: Self = serde_json::from_str(&textarea.inner_html()).unwrap();
release.path = path;
release.index = index;
release.tracks.reverse();
if author.inner_html() == "Kenji" {
return Err(ReleaseError::Ignored);
}
bar.inc(release.tracks.len() as u64);
Ok(release)
@ -156,7 +159,8 @@ async fn scan_page(
let elements = html.select(&RELEASES);
Ok(elements
.filter_map(|x| {
.enumerate()
.filter_map(|(i, x)| {
let label = x.select(&RELEASE_LABEL).next()?.inner_html();
if label == "Compilation" {
return None;
@ -164,6 +168,7 @@ async fn scan_page(
Some(Release::scan(
x.attr("href")?.to_string(),
(number * 12) + i,
client.clone(),
bar.clone(),
))
@ -174,26 +179,38 @@ async fn scan_page(
pub async fn scrape() -> eyre::Result<()> {
const PAGE_COUNT: usize = 40;
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36";
const TRACK_COUNT: u64 = 1500;
const TRACK_COUNT: u64 = 1600;
fs::create_dir_all("./cache/chillhop").await.unwrap();
let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
let mut futures = FuturesOrdered::new();
let bar = ProgressBar::new(TRACK_COUNT);
let mut futures = FuturesUnordered::new();
let bar = ProgressBar::new(TRACK_COUNT + 12 * (PAGE_COUNT as u64));
let mut errors = Vec::new();
// This is slightly less memory efficient than I'd hope, but it is what it is.
for page in 0..=PAGE_COUNT {
bar.inc(12);
for x in scan_page(page, &client, bar.clone()).await? {
futures.push_front(x);
futures.push(x);
}
}
let mut printed = Vec::with_capacity(1500);
while let Some(result) = futures.next().await {
let mut results: Vec<Result<Release, ReleaseError>> = futures.collect().await;
bar.finish_and_clear();
eprintln!("sorting...");
results.sort_by_key(|x| if let Ok(x) = x { x.index } else { 0 });
results.reverse();
eprintln!("printing...");
let mut printed = Vec::with_capacity(TRACK_COUNT as usize);
for result in results {
let release = match result {
Ok(release) => release,
Err(_) => {
Err(error) => {
errors.push(error);
continue;
}
};
@ -214,7 +231,14 @@ pub async fn scrape() -> eyre::Result<()> {
}
}
bar.finish();
eprintln!("-- ERROR REPORT --");
for error in errors {
if matches!(error, ReleaseError::Ignored) {
continue;
}
eprintln!("{error}");
}
Ok(())
}