mirror of
https://github.com/talwat/lowfi
synced 2025-08-17 15:12:37 +00:00
feat: make scraper fully reproducible (hopefully)
This commit is contained in:
parent
1052b5e782
commit
7d486b08f5
@ -614,6 +614,19 @@ https://stream.chillhop.com/mp3/
|
|||||||
9225!Tumbling
|
9225!Tumbling
|
||||||
9224!Fox
|
9224!Fox
|
||||||
8037!Cliff Of The Horizon
|
8037!Cliff Of The Horizon
|
||||||
|
8261!Drifter
|
||||||
|
8985!Mozambique
|
||||||
|
8707!autumn breeze
|
||||||
|
8080!YesPlease
|
||||||
|
8391!Chasin Daisys
|
||||||
|
7922!A Tribe Called Tenz
|
||||||
|
7840!Sunsets
|
||||||
|
8827!Changing Lanes
|
||||||
|
7833!Sunlit
|
||||||
|
8117!back when it all made sense
|
||||||
|
9003!jam session
|
||||||
|
7914!London Love Letters
|
||||||
|
8405!Chrysalism
|
||||||
9339!Highland
|
9339!Highland
|
||||||
9237!Waterfall Eyes
|
9237!Waterfall Eyes
|
||||||
9010!Origin
|
9010!Origin
|
||||||
@ -752,7 +765,6 @@ https://stream.chillhop.com/mp3/
|
|||||||
9783!After All
|
9783!After All
|
||||||
9284!Anywhere But Here
|
9284!Anywhere But Here
|
||||||
8072!comfortable
|
8072!comfortable
|
||||||
7914!London Love Letters
|
|
||||||
7929!Wrong Way
|
7929!Wrong Way
|
||||||
7955!Cozy Fire
|
7955!Cozy Fire
|
||||||
10320!Ocean View
|
10320!Ocean View
|
||||||
@ -1228,14 +1240,6 @@ https://stream.chillhop.com/mp3/
|
|||||||
60840!Suzuki
|
60840!Suzuki
|
||||||
60842!Mahogany
|
60842!Mahogany
|
||||||
65379!Let Go (Philanthrope Remix)
|
65379!Let Go (Philanthrope Remix)
|
||||||
64098!Satin
|
|
||||||
64097!Setbacks
|
|
||||||
64096!Loner
|
|
||||||
64095!Coast
|
|
||||||
64094!Back Again
|
|
||||||
64093!No Hassle
|
|
||||||
64092!Bloom
|
|
||||||
64091!Roam
|
|
||||||
64125!Fade out
|
64125!Fade out
|
||||||
64124!Things Fall Apart Pt.2
|
64124!Things Fall Apart Pt.2
|
||||||
64123!Serenade
|
64123!Serenade
|
||||||
@ -1244,6 +1248,14 @@ https://stream.chillhop.com/mp3/
|
|||||||
64119!Move Like That
|
64119!Move Like That
|
||||||
64118!Panda
|
64118!Panda
|
||||||
64117!Beavis pt.2
|
64117!Beavis pt.2
|
||||||
|
64098!Satin
|
||||||
|
64097!Setbacks
|
||||||
|
64096!Loner
|
||||||
|
64095!Coast
|
||||||
|
64094!Back Again
|
||||||
|
64093!No Hassle
|
||||||
|
64092!Bloom
|
||||||
|
64091!Roam
|
||||||
65489!Inhale/Ad Astra (Boukas Remix)
|
65489!Inhale/Ad Astra (Boukas Remix)
|
||||||
68327!Naturally Flavored
|
68327!Naturally Flavored
|
||||||
68324!Forward Movement
|
68324!Forward Movement
|
||||||
@ -1312,15 +1324,8 @@ https://stream.chillhop.com/mp3/
|
|||||||
64054!I Don't Want Love
|
64054!I Don't Want Love
|
||||||
64040!One for Florian
|
64040!One for Florian
|
||||||
75541!Seu Trio
|
75541!Seu Trio
|
||||||
64045!Cut Free
|
|
||||||
75544!You Bring Me Life
|
75544!You Bring Me Life
|
||||||
64036!Curtain Call
|
64045!Cut Free
|
||||||
64038!High Hope
|
|
||||||
64043!In the Sun
|
|
||||||
64047!Sleeping Norboo
|
|
||||||
64050!Autumn Turned Winter
|
|
||||||
64052!Light of World
|
|
||||||
64056!Harp Trees
|
|
||||||
75547!Last One
|
75547!Last One
|
||||||
75546!That Summer
|
75546!That Summer
|
||||||
75545!Hold it Down
|
75545!Hold it Down
|
||||||
@ -1328,6 +1333,13 @@ https://stream.chillhop.com/mp3/
|
|||||||
75542!Hope
|
75542!Hope
|
||||||
75540!When All I Heard Was Artifacts
|
75540!When All I Heard Was Artifacts
|
||||||
75539!Cantar
|
75539!Cantar
|
||||||
|
64036!Curtain Call
|
||||||
|
64038!High Hope
|
||||||
|
64043!In the Sun
|
||||||
|
64047!Sleeping Norboo
|
||||||
|
64050!Autumn Turned Winter
|
||||||
|
64052!Light of World
|
||||||
|
64056!Harp Trees
|
||||||
79272!Early June
|
79272!Early June
|
||||||
74856!Light of World
|
74856!Light of World
|
||||||
77527!Guitar Shop
|
77527!Guitar Shop
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
use eyre::{bail, eyre};
|
use eyre::{bail, eyre};
|
||||||
|
use futures::stream::FuturesUnordered;
|
||||||
use futures::{stream::FuturesOrdered, StreamExt};
|
use futures::{stream::FuturesOrdered, StreamExt};
|
||||||
use indicatif::ProgressBar;
|
use indicatif::ProgressBar;
|
||||||
use lazy_static::lazy_static;
|
use lazy_static::lazy_static;
|
||||||
@ -45,14 +46,14 @@ impl Track {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[allow(dead_code)]
|
|
||||||
#[derive(Deserialize, Debug)]
|
#[derive(Deserialize, Debug)]
|
||||||
struct Release {
|
struct Release {
|
||||||
#[serde(skip)]
|
#[serde(skip)]
|
||||||
pub path: String,
|
pub path: String,
|
||||||
|
|
||||||
#[serde(skip)]
|
#[serde(skip)]
|
||||||
pub name: String,
|
pub index: usize,
|
||||||
|
|
||||||
pub tracks: Vec<Track>,
|
pub tracks: Vec<Track>,
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -68,16 +69,21 @@ enum ReleaseError {
|
|||||||
impl Release {
|
impl Release {
|
||||||
pub async fn scan(
|
pub async fn scan(
|
||||||
path: String,
|
path: String,
|
||||||
|
index: usize,
|
||||||
client: Client,
|
client: Client,
|
||||||
bar: ProgressBar,
|
bar: ProgressBar,
|
||||||
) -> Result<Self, ReleaseError> {
|
) -> Result<Self, ReleaseError> {
|
||||||
let content = get(&client, &path).await?;
|
let content = get(&client, &path).await?;
|
||||||
let html = Html::parse_document(&content);
|
let html = Html::parse_document(&content);
|
||||||
|
|
||||||
let author = html
|
let author = html.select(&RELEASE_AUTHOR).next();
|
||||||
.select(&RELEASE_AUTHOR)
|
|
||||||
.next()
|
if let Some(author) = author {
|
||||||
.ok_or(eyre!("unable to find author: {path}"))?;
|
if author.inner_html() == "Kenji" {
|
||||||
|
// No lyrics!
|
||||||
|
return Err(ReleaseError::Ignored);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let textarea = html
|
let textarea = html
|
||||||
.select(&RELEASE_TEXTAREA)
|
.select(&RELEASE_TEXTAREA)
|
||||||
@ -86,12 +92,9 @@ impl Release {
|
|||||||
|
|
||||||
let mut release: Self = serde_json::from_str(&textarea.inner_html()).unwrap();
|
let mut release: Self = serde_json::from_str(&textarea.inner_html()).unwrap();
|
||||||
release.path = path;
|
release.path = path;
|
||||||
|
release.index = index;
|
||||||
release.tracks.reverse();
|
release.tracks.reverse();
|
||||||
|
|
||||||
if author.inner_html() == "Kenji" {
|
|
||||||
return Err(ReleaseError::Ignored);
|
|
||||||
}
|
|
||||||
|
|
||||||
bar.inc(release.tracks.len() as u64);
|
bar.inc(release.tracks.len() as u64);
|
||||||
|
|
||||||
Ok(release)
|
Ok(release)
|
||||||
@ -156,7 +159,8 @@ async fn scan_page(
|
|||||||
|
|
||||||
let elements = html.select(&RELEASES);
|
let elements = html.select(&RELEASES);
|
||||||
Ok(elements
|
Ok(elements
|
||||||
.filter_map(|x| {
|
.enumerate()
|
||||||
|
.filter_map(|(i, x)| {
|
||||||
let label = x.select(&RELEASE_LABEL).next()?.inner_html();
|
let label = x.select(&RELEASE_LABEL).next()?.inner_html();
|
||||||
if label == "Compilation" {
|
if label == "Compilation" {
|
||||||
return None;
|
return None;
|
||||||
@ -164,6 +168,7 @@ async fn scan_page(
|
|||||||
|
|
||||||
Some(Release::scan(
|
Some(Release::scan(
|
||||||
x.attr("href")?.to_string(),
|
x.attr("href")?.to_string(),
|
||||||
|
(number * 12) + i,
|
||||||
client.clone(),
|
client.clone(),
|
||||||
bar.clone(),
|
bar.clone(),
|
||||||
))
|
))
|
||||||
@ -174,26 +179,38 @@ async fn scan_page(
|
|||||||
pub async fn scrape() -> eyre::Result<()> {
|
pub async fn scrape() -> eyre::Result<()> {
|
||||||
const PAGE_COUNT: usize = 40;
|
const PAGE_COUNT: usize = 40;
|
||||||
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36";
|
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36";
|
||||||
const TRACK_COUNT: u64 = 1500;
|
const TRACK_COUNT: u64 = 1600;
|
||||||
|
|
||||||
fs::create_dir_all("./cache/chillhop").await.unwrap();
|
fs::create_dir_all("./cache/chillhop").await.unwrap();
|
||||||
let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
|
let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
|
||||||
|
|
||||||
let mut futures = FuturesOrdered::new();
|
let mut futures = FuturesUnordered::new();
|
||||||
let bar = ProgressBar::new(TRACK_COUNT);
|
let bar = ProgressBar::new(TRACK_COUNT + 12 * (PAGE_COUNT as u64));
|
||||||
|
|
||||||
|
let mut errors = Vec::new();
|
||||||
|
|
||||||
// This is slightly less memory efficient than I'd hope, but it is what it is.
|
// This is slightly less memory efficient than I'd hope, but it is what it is.
|
||||||
for page in 0..=PAGE_COUNT {
|
for page in 0..=PAGE_COUNT {
|
||||||
|
bar.inc(12);
|
||||||
for x in scan_page(page, &client, bar.clone()).await? {
|
for x in scan_page(page, &client, bar.clone()).await? {
|
||||||
futures.push_front(x);
|
futures.push(x);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut printed = Vec::with_capacity(1500);
|
let mut results: Vec<Result<Release, ReleaseError>> = futures.collect().await;
|
||||||
while let Some(result) = futures.next().await {
|
bar.finish_and_clear();
|
||||||
|
|
||||||
|
eprintln!("sorting...");
|
||||||
|
results.sort_by_key(|x| if let Ok(x) = x { x.index } else { 0 });
|
||||||
|
results.reverse();
|
||||||
|
|
||||||
|
eprintln!("printing...");
|
||||||
|
let mut printed = Vec::with_capacity(TRACK_COUNT as usize);
|
||||||
|
for result in results {
|
||||||
let release = match result {
|
let release = match result {
|
||||||
Ok(release) => release,
|
Ok(release) => release,
|
||||||
Err(_) => {
|
Err(error) => {
|
||||||
|
errors.push(error);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@ -214,7 +231,14 @@ pub async fn scrape() -> eyre::Result<()> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bar.finish();
|
eprintln!("-- ERROR REPORT --");
|
||||||
|
for error in errors {
|
||||||
|
if matches!(error, ReleaseError::Ignored) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
eprintln!("{error}");
|
||||||
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user