fix: improve scraper and remove duplicates

This commit is contained in:
Tal 2025-08-08 11:34:10 +02:00
parent 85fd109a05
commit 542807b5af
4 changed files with 1591 additions and 1417 deletions

145
Cargo.lock generated
View File

@ -471,6 +471,19 @@ dependencies = [
"crossbeam-utils",
]
[[package]]
name = "console"
version = "0.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2e09ced7ebbccb63b4c65413d821f2e00ce54c5ca4514ddc6b3c892fdbcbc69d"
dependencies = [
"encode_unicode",
"libc",
"once_cell",
"unicode-width 0.2.1",
"windows-sys 0.60.2",
]
[[package]]
name = "core-foundation"
version = "0.9.4"
@ -691,6 +704,12 @@ version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c6ba7d4eec39eaa9ab24d44a0e73a7949a1095a8b3f3abb11eddf27dbb56a53"
[[package]]
name = "encode_unicode"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0"
[[package]]
name = "encoding_rs"
version = "0.8.35"
@ -953,7 +972,7 @@ version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5"
dependencies = [
"unicode-width",
"unicode-width 0.1.14",
]
[[package]]
@ -1306,6 +1325,19 @@ dependencies = [
"hashbrown",
]
[[package]]
name = "indicatif"
version = "0.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "70a646d946d06bedbbc4cac4c218acf4bbf2d87757a784857025f4d447e4e1cd"
dependencies = [
"console",
"portable-atomic",
"unicode-width 0.2.1",
"unit-prefix",
"web-time",
]
[[package]]
name = "ipnet"
version = "2.10.1"
@ -1420,6 +1452,7 @@ dependencies = [
"eyre",
"futures",
"html-escape",
"indicatif",
"lazy_static",
"libc",
"mpris-server",
@ -1959,6 +1992,12 @@ dependencies = [
"windows-sys 0.59.0",
]
[[package]]
name = "portable-atomic"
version = "1.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f84267b20a16ea918e43c6a88433c2d54fa145c92a811b5b047ccbe153674483"
[[package]]
name = "ppv-lite86"
version = "0.2.20"
@ -2968,6 +3007,18 @@ version = "0.1.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af"
[[package]]
name = "unicode-width"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a1a07cc7db3810833284e8d372ccdc6da29741639ecc70c9ec107df0fa6154c"
[[package]]
name = "unit-prefix"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "323402cff2dd658f39ca17c789b502021b3f18707c91cdf22e3838e1b4023817"
[[package]]
name = "untrusted"
version = "0.9.0"
@ -3129,6 +3180,16 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "web-time"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb"
dependencies = [
"js-sys",
"wasm-bindgen",
]
[[package]]
name = "winapi"
version = "0.3.9"
@ -3180,6 +3241,12 @@ dependencies = [
"windows-targets 0.52.6",
]
[[package]]
name = "windows-link"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a"
[[package]]
name = "windows-registry"
version = "0.2.0"
@ -3255,6 +3322,15 @@ dependencies = [
"windows-targets 0.52.6",
]
[[package]]
name = "windows-sys"
version = "0.60.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb"
dependencies = [
"windows-targets 0.53.3",
]
[[package]]
name = "windows-targets"
version = "0.42.2"
@ -3294,13 +3370,30 @@ dependencies = [
"windows_aarch64_gnullvm 0.52.6",
"windows_aarch64_msvc 0.52.6",
"windows_i686_gnu 0.52.6",
"windows_i686_gnullvm",
"windows_i686_gnullvm 0.52.6",
"windows_i686_msvc 0.52.6",
"windows_x86_64_gnu 0.52.6",
"windows_x86_64_gnullvm 0.52.6",
"windows_x86_64_msvc 0.52.6",
]
[[package]]
name = "windows-targets"
version = "0.53.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d5fe6031c4041849d7c496a8ded650796e7b6ecc19df1a431c1a363342e5dc91"
dependencies = [
"windows-link",
"windows_aarch64_gnullvm 0.53.0",
"windows_aarch64_msvc 0.53.0",
"windows_i686_gnu 0.53.0",
"windows_i686_gnullvm 0.53.0",
"windows_i686_msvc 0.53.0",
"windows_x86_64_gnu 0.53.0",
"windows_x86_64_gnullvm 0.53.0",
"windows_x86_64_msvc 0.53.0",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.42.2"
@ -3319,6 +3412,12 @@ version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.53.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764"
[[package]]
name = "windows_aarch64_msvc"
version = "0.42.2"
@ -3337,6 +3436,12 @@ version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
[[package]]
name = "windows_aarch64_msvc"
version = "0.53.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c"
[[package]]
name = "windows_i686_gnu"
version = "0.42.2"
@ -3355,12 +3460,24 @@ version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
[[package]]
name = "windows_i686_gnu"
version = "0.53.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3"
[[package]]
name = "windows_i686_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
[[package]]
name = "windows_i686_gnullvm"
version = "0.53.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11"
[[package]]
name = "windows_i686_msvc"
version = "0.42.2"
@ -3379,6 +3496,12 @@ version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
[[package]]
name = "windows_i686_msvc"
version = "0.53.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d"
[[package]]
name = "windows_x86_64_gnu"
version = "0.42.2"
@ -3397,6 +3520,12 @@ version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
[[package]]
name = "windows_x86_64_gnu"
version = "0.53.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.42.2"
@ -3415,6 +3544,12 @@ version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.53.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57"
[[package]]
name = "windows_x86_64_msvc"
version = "0.42.2"
@ -3433,6 +3568,12 @@ version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
[[package]]
name = "windows_x86_64_msvc"
version = "0.53.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486"
[[package]]
name = "winnow"
version = "0.6.22"

View File

@ -57,3 +57,4 @@ unicode-segmentation = "1.12.0"
serde = { version = "1.0.219", features = ["derive"] }
serde_json = "1.0.142"
html-escape = "0.2.13"
indicatif = "0.18.0"

File diff suppressed because it is too large Load Diff

View File

@ -1,11 +1,19 @@
use eyre::{bail, eyre};
use futures::{stream::FuturesOrdered, StreamExt};
use indicatif::ProgressBar;
use lazy_static::lazy_static;
use std::path::{Path, PathBuf};
use std::str::FromStr;
use std::{
fmt,
path::{Path, PathBuf},
};
use reqwest::Client;
use scraper::{Html, Selector};
use serde::Deserialize;
use serde::{
de::{self, Visitor},
Deserialize, Deserializer,
};
use tokio::{
fs::{self, File},
io::AsyncWriteExt,
@ -24,10 +32,19 @@ lazy_static! {
#[serde(rename_all = "camelCase")]
pub struct Track {
title: String,
file_id: String,
#[serde(deserialize_with = "deserialize_u32_from_string")]
file_id: u32,
artists: String,
}
impl Track {
pub fn clean(&mut self) {
self.artists = html_escape::decode_html_entities(&self.artists).to_string();
self.title = html_escape::decode_html_entities(&self.title).to_string();
}
}
#[derive(Deserialize, Debug)]
struct Release {
#[serde(skip)]
@ -39,15 +56,19 @@ struct Release {
#[derive(thiserror::Error, Debug)]
enum ReleaseError {
#[error("invalid track: {0}")]
#[error("invalid release: {0}")]
Invalid(#[from] eyre::Error),
#[error("track explicitly ignored")]
#[error("release explicitly ignored")]
Ignored,
}
impl Release {
pub async fn scan(path: String, client: Client) -> Result<Self, ReleaseError> {
pub async fn scan(
path: String,
client: Client,
bar: ProgressBar,
) -> Result<Self, ReleaseError> {
let content = get(&client, &path).await?;
let html = Html::parse_document(&content);
@ -66,7 +87,7 @@ impl Release {
return Err(ReleaseError::Ignored);
}
eprintln!("finished scanning {path}!");
bar.inc(release.tracks.len() as u64);
Ok(release)
}
@ -122,6 +143,7 @@ async fn get(client: &Client, path: &str) -> eyre::Result<String> {
async fn scan_page(
number: usize,
client: &Client,
bar: ProgressBar,
) -> eyre::Result<Vec<impl futures::Future<Output = Result<Release, ReleaseError>>>> {
let path = format!("releases/?page={number}");
let content = get(client, &path).await?;
@ -131,11 +153,15 @@ async fn scan_page(
Ok(elements
.filter_map(|x| {
let label = x.select(&RELEASE_LABEL).next()?.inner_html();
if label == "Compilation" || label == "Mix" {
if label == "Compilation" {
return None;
}
Some(Release::scan(x.attr("href")?.to_string(), client.clone()))
Some(Release::scan(
x.attr("href")?.to_string(),
client.clone(),
bar.clone(),
))
})
.collect())
}
@ -143,37 +169,76 @@ async fn scan_page(
pub async fn scrape() -> eyre::Result<()> {
const PAGE_COUNT: usize = 40;
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36";
const TRACK_COUNT: u64 = 1500;
fs::create_dir_all("./cache/chillhop").await.unwrap();
let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
let mut futures = FuturesOrdered::new();
let bar = ProgressBar::new(TRACK_COUNT);
// This is slightly less memory efficient than I'd hope, but it is what it is.
for page in 0..=PAGE_COUNT {
for x in scan_page(page, &client).await? {
for x in scan_page(page, &client, bar.clone()).await? {
futures.push_front(x);
}
}
let mut printed = Vec::with_capacity(1500);
while let Some(result) = futures.next().await {
let release = match result {
Ok(release) => release,
Err(error) => {
eprintln!("error: {}, skipping", error);
Err(_) => {
continue;
}
};
for track in release.tracks {
let title = html_escape::decode_html_entities(&track.title);
let artist = html_escape::decode_html_entities(
track.artists.split(", ").next().unwrap_or(&track.artists),
);
for mut track in release.tracks {
if track.file_id == 74707 {
continue;
}
println!("{}!{artist} - {title}", track.file_id)
if printed.contains(&track.file_id) {
continue;
}
printed.push(track.file_id);
track.clean();
println!("{}!{}", track.file_id, track.title);
}
}
bar.finish();
Ok(())
}
pub fn deserialize_u32_from_string<'de, D>(deserializer: D) -> Result<u32, D::Error>
where
D: Deserializer<'de>,
{
struct U32FromStringVisitor;
impl<'de> Visitor<'de> for U32FromStringVisitor {
type Value = u32;
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
formatter.write_str("a string containing an unsigned 32-bit integer")
}
fn visit_str<E>(self, value: &str) -> Result<Self::Value, E>
where
E: de::Error,
{
u32::from_str(value).map_err(|_| {
de::Error::invalid_value(
de::Unexpected::Str(value),
&"a valid unsigned 32-bit integer",
)
})
}
}
deserializer.deserialize_str(U32FromStringVisitor)
}