mirror of
https://github.com/talwat/lowfi
synced 2025-08-16 06:32:14 +00:00
feat: make scraper fully reproducible (hopefully)
This commit is contained in:
parent
1052b5e782
commit
7d486b08f5
@ -614,6 +614,19 @@ https://stream.chillhop.com/mp3/
|
||||
9225!Tumbling
|
||||
9224!Fox
|
||||
8037!Cliff Of The Horizon
|
||||
8261!Drifter
|
||||
8985!Mozambique
|
||||
8707!autumn breeze
|
||||
8080!YesPlease
|
||||
8391!Chasin Daisys
|
||||
7922!A Tribe Called Tenz
|
||||
7840!Sunsets
|
||||
8827!Changing Lanes
|
||||
7833!Sunlit
|
||||
8117!back when it all made sense
|
||||
9003!jam session
|
||||
7914!London Love Letters
|
||||
8405!Chrysalism
|
||||
9339!Highland
|
||||
9237!Waterfall Eyes
|
||||
9010!Origin
|
||||
@ -752,7 +765,6 @@ https://stream.chillhop.com/mp3/
|
||||
9783!After All
|
||||
9284!Anywhere But Here
|
||||
8072!comfortable
|
||||
7914!London Love Letters
|
||||
7929!Wrong Way
|
||||
7955!Cozy Fire
|
||||
10320!Ocean View
|
||||
@ -1228,14 +1240,6 @@ https://stream.chillhop.com/mp3/
|
||||
60840!Suzuki
|
||||
60842!Mahogany
|
||||
65379!Let Go (Philanthrope Remix)
|
||||
64098!Satin
|
||||
64097!Setbacks
|
||||
64096!Loner
|
||||
64095!Coast
|
||||
64094!Back Again
|
||||
64093!No Hassle
|
||||
64092!Bloom
|
||||
64091!Roam
|
||||
64125!Fade out
|
||||
64124!Things Fall Apart Pt.2
|
||||
64123!Serenade
|
||||
@ -1244,6 +1248,14 @@ https://stream.chillhop.com/mp3/
|
||||
64119!Move Like That
|
||||
64118!Panda
|
||||
64117!Beavis pt.2
|
||||
64098!Satin
|
||||
64097!Setbacks
|
||||
64096!Loner
|
||||
64095!Coast
|
||||
64094!Back Again
|
||||
64093!No Hassle
|
||||
64092!Bloom
|
||||
64091!Roam
|
||||
65489!Inhale/Ad Astra (Boukas Remix)
|
||||
68327!Naturally Flavored
|
||||
68324!Forward Movement
|
||||
@ -1312,15 +1324,8 @@ https://stream.chillhop.com/mp3/
|
||||
64054!I Don't Want Love
|
||||
64040!One for Florian
|
||||
75541!Seu Trio
|
||||
64045!Cut Free
|
||||
75544!You Bring Me Life
|
||||
64036!Curtain Call
|
||||
64038!High Hope
|
||||
64043!In the Sun
|
||||
64047!Sleeping Norboo
|
||||
64050!Autumn Turned Winter
|
||||
64052!Light of World
|
||||
64056!Harp Trees
|
||||
64045!Cut Free
|
||||
75547!Last One
|
||||
75546!That Summer
|
||||
75545!Hold it Down
|
||||
@ -1328,6 +1333,13 @@ https://stream.chillhop.com/mp3/
|
||||
75542!Hope
|
||||
75540!When All I Heard Was Artifacts
|
||||
75539!Cantar
|
||||
64036!Curtain Call
|
||||
64038!High Hope
|
||||
64043!In the Sun
|
||||
64047!Sleeping Norboo
|
||||
64050!Autumn Turned Winter
|
||||
64052!Light of World
|
||||
64056!Harp Trees
|
||||
79272!Early June
|
||||
74856!Light of World
|
||||
77527!Guitar Shop
|
||||
|
@ -1,4 +1,5 @@
|
||||
use eyre::{bail, eyre};
|
||||
use futures::stream::FuturesUnordered;
|
||||
use futures::{stream::FuturesOrdered, StreamExt};
|
||||
use indicatif::ProgressBar;
|
||||
use lazy_static::lazy_static;
|
||||
@ -45,14 +46,14 @@ impl Track {
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[derive(Deserialize, Debug)]
|
||||
struct Release {
|
||||
#[serde(skip)]
|
||||
pub path: String,
|
||||
|
||||
#[serde(skip)]
|
||||
pub name: String,
|
||||
pub index: usize,
|
||||
|
||||
pub tracks: Vec<Track>,
|
||||
}
|
||||
|
||||
@ -68,16 +69,21 @@ enum ReleaseError {
|
||||
impl Release {
|
||||
pub async fn scan(
|
||||
path: String,
|
||||
index: usize,
|
||||
client: Client,
|
||||
bar: ProgressBar,
|
||||
) -> Result<Self, ReleaseError> {
|
||||
let content = get(&client, &path).await?;
|
||||
let html = Html::parse_document(&content);
|
||||
|
||||
let author = html
|
||||
.select(&RELEASE_AUTHOR)
|
||||
.next()
|
||||
.ok_or(eyre!("unable to find author: {path}"))?;
|
||||
let author = html.select(&RELEASE_AUTHOR).next();
|
||||
|
||||
if let Some(author) = author {
|
||||
if author.inner_html() == "Kenji" {
|
||||
// No lyrics!
|
||||
return Err(ReleaseError::Ignored);
|
||||
}
|
||||
}
|
||||
|
||||
let textarea = html
|
||||
.select(&RELEASE_TEXTAREA)
|
||||
@ -86,12 +92,9 @@ impl Release {
|
||||
|
||||
let mut release: Self = serde_json::from_str(&textarea.inner_html()).unwrap();
|
||||
release.path = path;
|
||||
release.index = index;
|
||||
release.tracks.reverse();
|
||||
|
||||
if author.inner_html() == "Kenji" {
|
||||
return Err(ReleaseError::Ignored);
|
||||
}
|
||||
|
||||
bar.inc(release.tracks.len() as u64);
|
||||
|
||||
Ok(release)
|
||||
@ -156,7 +159,8 @@ async fn scan_page(
|
||||
|
||||
let elements = html.select(&RELEASES);
|
||||
Ok(elements
|
||||
.filter_map(|x| {
|
||||
.enumerate()
|
||||
.filter_map(|(i, x)| {
|
||||
let label = x.select(&RELEASE_LABEL).next()?.inner_html();
|
||||
if label == "Compilation" {
|
||||
return None;
|
||||
@ -164,6 +168,7 @@ async fn scan_page(
|
||||
|
||||
Some(Release::scan(
|
||||
x.attr("href")?.to_string(),
|
||||
(number * 12) + i,
|
||||
client.clone(),
|
||||
bar.clone(),
|
||||
))
|
||||
@ -174,26 +179,38 @@ async fn scan_page(
|
||||
pub async fn scrape() -> eyre::Result<()> {
|
||||
const PAGE_COUNT: usize = 40;
|
||||
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36";
|
||||
const TRACK_COUNT: u64 = 1500;
|
||||
const TRACK_COUNT: u64 = 1600;
|
||||
|
||||
fs::create_dir_all("./cache/chillhop").await.unwrap();
|
||||
let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
|
||||
|
||||
let mut futures = FuturesOrdered::new();
|
||||
let bar = ProgressBar::new(TRACK_COUNT);
|
||||
let mut futures = FuturesUnordered::new();
|
||||
let bar = ProgressBar::new(TRACK_COUNT + 12 * (PAGE_COUNT as u64));
|
||||
|
||||
let mut errors = Vec::new();
|
||||
|
||||
// This is slightly less memory efficient than I'd hope, but it is what it is.
|
||||
for page in 0..=PAGE_COUNT {
|
||||
bar.inc(12);
|
||||
for x in scan_page(page, &client, bar.clone()).await? {
|
||||
futures.push_front(x);
|
||||
futures.push(x);
|
||||
}
|
||||
}
|
||||
|
||||
let mut printed = Vec::with_capacity(1500);
|
||||
while let Some(result) = futures.next().await {
|
||||
let mut results: Vec<Result<Release, ReleaseError>> = futures.collect().await;
|
||||
bar.finish_and_clear();
|
||||
|
||||
eprintln!("sorting...");
|
||||
results.sort_by_key(|x| if let Ok(x) = x { x.index } else { 0 });
|
||||
results.reverse();
|
||||
|
||||
eprintln!("printing...");
|
||||
let mut printed = Vec::with_capacity(TRACK_COUNT as usize);
|
||||
for result in results {
|
||||
let release = match result {
|
||||
Ok(release) => release,
|
||||
Err(_) => {
|
||||
Err(error) => {
|
||||
errors.push(error);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
@ -214,7 +231,14 @@ pub async fn scrape() -> eyre::Result<()> {
|
||||
}
|
||||
}
|
||||
|
||||
bar.finish();
|
||||
eprintln!("-- ERROR REPORT --");
|
||||
for error in errors {
|
||||
if matches!(error, ReleaseError::Ignored) {
|
||||
continue;
|
||||
}
|
||||
|
||||
eprintln!("{error}");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user