lowfi/src/scrapers/chillhop.rs

224 lines
6.2 KiB
Rust

use eyre::eyre;
use futures::stream::FuturesUnordered;
use futures::StreamExt;
use indicatif::ProgressBar;
use lazy_static::lazy_static;
use std::fmt;
use std::str::FromStr;
use reqwest::Client;
use scraper::{Html, Selector};
use serde::{
de::{self, Visitor},
Deserialize, Deserializer,
};
use tokio::fs;
use crate::scrapers::{get, Source};
lazy_static! {
static ref RELEASES: Selector = Selector::parse(".table-body > a").unwrap();
static ref RELEASE_LABEL: Selector = Selector::parse("label").unwrap();
// static ref RELEASE_DATE: Selector = Selector::parse(".release-feat-props > .text-xs").unwrap();
// static ref RELEASE_NAME: Selector = Selector::parse(".release-feat-props > h2").unwrap();
static ref RELEASE_AUTHOR: Selector = Selector::parse(".release-feat-props .artist-link").unwrap();
static ref RELEASE_TEXTAREA: Selector = Selector::parse("textarea").unwrap();
}
#[derive(Deserialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct Track {
title: String,
#[serde(deserialize_with = "deserialize_u32_from_string")]
file_id: u32,
artists: String,
}
impl Track {
pub fn clean(&mut self) {
self.artists = html_escape::decode_html_entities(&self.artists).to_string();
self.title = html_escape::decode_html_entities(&self.title).to_string();
}
}
#[derive(Deserialize, Debug)]
struct Release {
#[serde(skip)]
pub path: String,
#[serde(skip)]
pub index: usize,
pub tracks: Vec<Track>,
}
#[derive(thiserror::Error, Debug)]
enum ReleaseError {
#[error("invalid release: {0}")]
Invalid(#[from] eyre::Error),
}
impl Release {
pub async fn scan(
path: String,
index: usize,
client: Client,
bar: ProgressBar,
) -> Result<Self, ReleaseError> {
let content = get(&client, &path, Source::Chillhop).await?;
let html = Html::parse_document(&content);
let textarea = html
.select(&RELEASE_TEXTAREA)
.next()
.ok_or(eyre!("unable to find textarea: {path}"))?;
let mut release: Self = serde_json::from_str(&textarea.inner_html()).unwrap();
release.path = path;
release.index = index;
release.tracks.reverse();
bar.inc(release.tracks.len() as u64);
Ok(release)
}
}
async fn scan_page(
number: usize,
client: &Client,
bar: ProgressBar,
) -> eyre::Result<Vec<impl futures::Future<Output = Result<Release, ReleaseError>>>> {
let path = format!("releases/?page={number}");
let content = get(client, &path, Source::Chillhop).await?;
let html = Html::parse_document(&content);
let elements = html.select(&RELEASES);
Ok(elements
.enumerate()
.filter_map(|(i, x)| {
let label = x.select(&RELEASE_LABEL).next()?.inner_html();
if label == "Compilation" {
return None;
}
Some(Release::scan(
x.attr("href")?.to_string(),
(number * 12) + i,
client.clone(),
bar.clone(),
))
})
.collect())
}
pub async fn scrape() -> eyre::Result<()> {
const PAGE_COUNT: usize = 40;
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36";
const TRACK_COUNT: u64 = 1625;
const IGNORED_TRACKS: [u32; 20] = [
// 404
74707, // Lyrics
21655, 21773, 8172, 55397, 75135, 24827, 8141, 8157, 64052, 31612, 41956, 8001, 9217,
55372, // Abnormal
8469, 7832, 10448, 9446, 9396,
];
const IGNORED_ARTISTS: [&str; 1] = [
"Kenji", // Lyrics
];
fs::create_dir_all("./cache/chillhop").await.unwrap();
let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
let futures = FuturesUnordered::new();
let bar = ProgressBar::new(TRACK_COUNT + (12 * (PAGE_COUNT as u64)));
let mut errors = Vec::new();
// This is slightly less memory efficient than I'd hope, but it is what it is.
for page in 0..=PAGE_COUNT {
bar.inc(12);
for x in scan_page(page, &client, bar.clone()).await? {
futures.push(x);
}
}
let mut results: Vec<Result<Release, ReleaseError>> = futures.collect().await;
bar.finish_and_clear();
// I mean, is it... optimal? Absolutely not. Does it work? Yes.
eprintln!("sorting...");
results.sort_by_key(|x| if let Ok(x) = x { x.index } else { 0 });
results.reverse();
eprintln!("printing...");
let mut printed = Vec::with_capacity(TRACK_COUNT as usize); // Lazy way to get rid of dupes.
for result in results {
let release = match result {
Ok(release) => release,
Err(error) => {
errors.push(error);
continue;
}
};
for mut track in release.tracks {
if IGNORED_TRACKS.contains(&track.file_id) {
continue;
}
if IGNORED_ARTISTS.contains(&track.artists.as_ref()) {
continue;
}
if printed.contains(&track.file_id) {
continue;
}
printed.push(track.file_id);
track.clean();
println!("{}!{}", track.file_id, track.title);
}
}
eprintln!("-- ERROR REPORT --");
for error in errors {
eprintln!("{error}");
}
Ok(())
}
pub fn deserialize_u32_from_string<'de, D>(deserializer: D) -> Result<u32, D::Error>
where
D: Deserializer<'de>,
{
struct U32FromStringVisitor;
impl<'de> Visitor<'de> for U32FromStringVisitor {
type Value = u32;
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
formatter.write_str("a string containing an unsigned 32-bit integer")
}
fn visit_str<E>(self, value: &str) -> Result<Self::Value, E>
where
E: de::Error,
{
u32::from_str(value).map_err(|_| {
de::Error::invalid_value(
de::Unexpected::Str(value),
&"a valid unsigned 32-bit integer",
)
})
}
}
deserializer.deserialize_str(U32FromStringVisitor)
}