mirror of
https://github.com/talwat/lowfi
synced 2025-08-16 14:42:18 +00:00
fix: improve scraper and remove duplicates
This commit is contained in:
parent
85fd109a05
commit
542807b5af
145
Cargo.lock
generated
145
Cargo.lock
generated
@ -471,6 +471,19 @@ dependencies = [
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "console"
|
||||
version = "0.16.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2e09ced7ebbccb63b4c65413d821f2e00ce54c5ca4514ddc6b3c892fdbcbc69d"
|
||||
dependencies = [
|
||||
"encode_unicode",
|
||||
"libc",
|
||||
"once_cell",
|
||||
"unicode-width 0.2.1",
|
||||
"windows-sys 0.60.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "core-foundation"
|
||||
version = "0.9.4"
|
||||
@ -691,6 +704,12 @@ version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7c6ba7d4eec39eaa9ab24d44a0e73a7949a1095a8b3f3abb11eddf27dbb56a53"
|
||||
|
||||
[[package]]
|
||||
name = "encode_unicode"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0"
|
||||
|
||||
[[package]]
|
||||
name = "encoding_rs"
|
||||
version = "0.8.35"
|
||||
@ -953,7 +972,7 @@ version = "0.2.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5"
|
||||
dependencies = [
|
||||
"unicode-width",
|
||||
"unicode-width 0.1.14",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -1306,6 +1325,19 @@ dependencies = [
|
||||
"hashbrown",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "indicatif"
|
||||
version = "0.18.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "70a646d946d06bedbbc4cac4c218acf4bbf2d87757a784857025f4d447e4e1cd"
|
||||
dependencies = [
|
||||
"console",
|
||||
"portable-atomic",
|
||||
"unicode-width 0.2.1",
|
||||
"unit-prefix",
|
||||
"web-time",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ipnet"
|
||||
version = "2.10.1"
|
||||
@ -1420,6 +1452,7 @@ dependencies = [
|
||||
"eyre",
|
||||
"futures",
|
||||
"html-escape",
|
||||
"indicatif",
|
||||
"lazy_static",
|
||||
"libc",
|
||||
"mpris-server",
|
||||
@ -1959,6 +1992,12 @@ dependencies = [
|
||||
"windows-sys 0.59.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "portable-atomic"
|
||||
version = "1.11.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f84267b20a16ea918e43c6a88433c2d54fa145c92a811b5b047ccbe153674483"
|
||||
|
||||
[[package]]
|
||||
name = "ppv-lite86"
|
||||
version = "0.2.20"
|
||||
@ -2968,6 +3007,18 @@ version = "0.1.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-width"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4a1a07cc7db3810833284e8d372ccdc6da29741639ecc70c9ec107df0fa6154c"
|
||||
|
||||
[[package]]
|
||||
name = "unit-prefix"
|
||||
version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "323402cff2dd658f39ca17c789b502021b3f18707c91cdf22e3838e1b4023817"
|
||||
|
||||
[[package]]
|
||||
name = "untrusted"
|
||||
version = "0.9.0"
|
||||
@ -3129,6 +3180,16 @@ dependencies = [
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "web-time"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb"
|
||||
dependencies = [
|
||||
"js-sys",
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winapi"
|
||||
version = "0.3.9"
|
||||
@ -3180,6 +3241,12 @@ dependencies = [
|
||||
"windows-targets 0.52.6",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-link"
|
||||
version = "0.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a"
|
||||
|
||||
[[package]]
|
||||
name = "windows-registry"
|
||||
version = "0.2.0"
|
||||
@ -3255,6 +3322,15 @@ dependencies = [
|
||||
"windows-targets 0.52.6",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-sys"
|
||||
version = "0.60.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb"
|
||||
dependencies = [
|
||||
"windows-targets 0.53.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-targets"
|
||||
version = "0.42.2"
|
||||
@ -3294,13 +3370,30 @@ dependencies = [
|
||||
"windows_aarch64_gnullvm 0.52.6",
|
||||
"windows_aarch64_msvc 0.52.6",
|
||||
"windows_i686_gnu 0.52.6",
|
||||
"windows_i686_gnullvm",
|
||||
"windows_i686_gnullvm 0.52.6",
|
||||
"windows_i686_msvc 0.52.6",
|
||||
"windows_x86_64_gnu 0.52.6",
|
||||
"windows_x86_64_gnullvm 0.52.6",
|
||||
"windows_x86_64_msvc 0.52.6",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-targets"
|
||||
version = "0.53.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d5fe6031c4041849d7c496a8ded650796e7b6ecc19df1a431c1a363342e5dc91"
|
||||
dependencies = [
|
||||
"windows-link",
|
||||
"windows_aarch64_gnullvm 0.53.0",
|
||||
"windows_aarch64_msvc 0.53.0",
|
||||
"windows_i686_gnu 0.53.0",
|
||||
"windows_i686_gnullvm 0.53.0",
|
||||
"windows_i686_msvc 0.53.0",
|
||||
"windows_x86_64_gnu 0.53.0",
|
||||
"windows_x86_64_gnullvm 0.53.0",
|
||||
"windows_x86_64_msvc 0.53.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_gnullvm"
|
||||
version = "0.42.2"
|
||||
@ -3319,6 +3412,12 @@ version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_gnullvm"
|
||||
version = "0.53.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764"
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_msvc"
|
||||
version = "0.42.2"
|
||||
@ -3337,6 +3436,12 @@ version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_msvc"
|
||||
version = "0.53.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_gnu"
|
||||
version = "0.42.2"
|
||||
@ -3355,12 +3460,24 @@ version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_gnu"
|
||||
version = "0.53.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_gnullvm"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_gnullvm"
|
||||
version = "0.53.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_msvc"
|
||||
version = "0.42.2"
|
||||
@ -3379,6 +3496,12 @@ version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_msvc"
|
||||
version = "0.53.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnu"
|
||||
version = "0.42.2"
|
||||
@ -3397,6 +3520,12 @@ version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnu"
|
||||
version = "0.53.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnullvm"
|
||||
version = "0.42.2"
|
||||
@ -3415,6 +3544,12 @@ version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnullvm"
|
||||
version = "0.53.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_msvc"
|
||||
version = "0.42.2"
|
||||
@ -3433,6 +3568,12 @@ version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_msvc"
|
||||
version = "0.53.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486"
|
||||
|
||||
[[package]]
|
||||
name = "winnow"
|
||||
version = "0.6.22"
|
||||
|
@ -57,3 +57,4 @@ unicode-segmentation = "1.12.0"
|
||||
serde = { version = "1.0.219", features = ["derive"] }
|
||||
serde_json = "1.0.142"
|
||||
html-escape = "0.2.13"
|
||||
indicatif = "0.18.0"
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,11 +1,19 @@
|
||||
use eyre::{bail, eyre};
|
||||
use futures::{stream::FuturesOrdered, StreamExt};
|
||||
use indicatif::ProgressBar;
|
||||
use lazy_static::lazy_static;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::str::FromStr;
|
||||
use std::{
|
||||
fmt,
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
|
||||
use reqwest::Client;
|
||||
use scraper::{Html, Selector};
|
||||
use serde::Deserialize;
|
||||
use serde::{
|
||||
de::{self, Visitor},
|
||||
Deserialize, Deserializer,
|
||||
};
|
||||
use tokio::{
|
||||
fs::{self, File},
|
||||
io::AsyncWriteExt,
|
||||
@ -24,10 +32,19 @@ lazy_static! {
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct Track {
|
||||
title: String,
|
||||
file_id: String,
|
||||
#[serde(deserialize_with = "deserialize_u32_from_string")]
|
||||
file_id: u32,
|
||||
artists: String,
|
||||
}
|
||||
|
||||
impl Track {
|
||||
pub fn clean(&mut self) {
|
||||
self.artists = html_escape::decode_html_entities(&self.artists).to_string();
|
||||
|
||||
self.title = html_escape::decode_html_entities(&self.title).to_string();
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Debug)]
|
||||
struct Release {
|
||||
#[serde(skip)]
|
||||
@ -39,15 +56,19 @@ struct Release {
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
enum ReleaseError {
|
||||
#[error("invalid track: {0}")]
|
||||
#[error("invalid release: {0}")]
|
||||
Invalid(#[from] eyre::Error),
|
||||
|
||||
#[error("track explicitly ignored")]
|
||||
#[error("release explicitly ignored")]
|
||||
Ignored,
|
||||
}
|
||||
|
||||
impl Release {
|
||||
pub async fn scan(path: String, client: Client) -> Result<Self, ReleaseError> {
|
||||
pub async fn scan(
|
||||
path: String,
|
||||
client: Client,
|
||||
bar: ProgressBar,
|
||||
) -> Result<Self, ReleaseError> {
|
||||
let content = get(&client, &path).await?;
|
||||
let html = Html::parse_document(&content);
|
||||
|
||||
@ -66,7 +87,7 @@ impl Release {
|
||||
return Err(ReleaseError::Ignored);
|
||||
}
|
||||
|
||||
eprintln!("finished scanning {path}!");
|
||||
bar.inc(release.tracks.len() as u64);
|
||||
|
||||
Ok(release)
|
||||
}
|
||||
@ -122,6 +143,7 @@ async fn get(client: &Client, path: &str) -> eyre::Result<String> {
|
||||
async fn scan_page(
|
||||
number: usize,
|
||||
client: &Client,
|
||||
bar: ProgressBar,
|
||||
) -> eyre::Result<Vec<impl futures::Future<Output = Result<Release, ReleaseError>>>> {
|
||||
let path = format!("releases/?page={number}");
|
||||
let content = get(client, &path).await?;
|
||||
@ -131,11 +153,15 @@ async fn scan_page(
|
||||
Ok(elements
|
||||
.filter_map(|x| {
|
||||
let label = x.select(&RELEASE_LABEL).next()?.inner_html();
|
||||
if label == "Compilation" || label == "Mix" {
|
||||
if label == "Compilation" {
|
||||
return None;
|
||||
}
|
||||
|
||||
Some(Release::scan(x.attr("href")?.to_string(), client.clone()))
|
||||
Some(Release::scan(
|
||||
x.attr("href")?.to_string(),
|
||||
client.clone(),
|
||||
bar.clone(),
|
||||
))
|
||||
})
|
||||
.collect())
|
||||
}
|
||||
@ -143,37 +169,76 @@ async fn scan_page(
|
||||
pub async fn scrape() -> eyre::Result<()> {
|
||||
const PAGE_COUNT: usize = 40;
|
||||
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36";
|
||||
const TRACK_COUNT: u64 = 1500;
|
||||
|
||||
fs::create_dir_all("./cache/chillhop").await.unwrap();
|
||||
let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
|
||||
|
||||
let mut futures = FuturesOrdered::new();
|
||||
let bar = ProgressBar::new(TRACK_COUNT);
|
||||
|
||||
// This is slightly less memory efficient than I'd hope, but it is what it is.
|
||||
for page in 0..=PAGE_COUNT {
|
||||
for x in scan_page(page, &client).await? {
|
||||
for x in scan_page(page, &client, bar.clone()).await? {
|
||||
futures.push_front(x);
|
||||
}
|
||||
}
|
||||
|
||||
let mut printed = Vec::with_capacity(1500);
|
||||
while let Some(result) = futures.next().await {
|
||||
let release = match result {
|
||||
Ok(release) => release,
|
||||
Err(error) => {
|
||||
eprintln!("error: {}, skipping", error);
|
||||
Err(_) => {
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
for track in release.tracks {
|
||||
let title = html_escape::decode_html_entities(&track.title);
|
||||
let artist = html_escape::decode_html_entities(
|
||||
track.artists.split(", ").next().unwrap_or(&track.artists),
|
||||
);
|
||||
for mut track in release.tracks {
|
||||
if track.file_id == 74707 {
|
||||
continue;
|
||||
}
|
||||
|
||||
println!("{}!{artist} - {title}", track.file_id)
|
||||
if printed.contains(&track.file_id) {
|
||||
continue;
|
||||
}
|
||||
|
||||
printed.push(track.file_id);
|
||||
|
||||
track.clean();
|
||||
println!("{}!{}", track.file_id, track.title);
|
||||
}
|
||||
}
|
||||
|
||||
bar.finish();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn deserialize_u32_from_string<'de, D>(deserializer: D) -> Result<u32, D::Error>
|
||||
where
|
||||
D: Deserializer<'de>,
|
||||
{
|
||||
struct U32FromStringVisitor;
|
||||
|
||||
impl<'de> Visitor<'de> for U32FromStringVisitor {
|
||||
type Value = u32;
|
||||
|
||||
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
|
||||
formatter.write_str("a string containing an unsigned 32-bit integer")
|
||||
}
|
||||
|
||||
fn visit_str<E>(self, value: &str) -> Result<Self::Value, E>
|
||||
where
|
||||
E: de::Error,
|
||||
{
|
||||
u32::from_str(value).map_err(|_| {
|
||||
de::Error::invalid_value(
|
||||
de::Unexpected::Str(value),
|
||||
&"a valid unsigned 32-bit integer",
|
||||
)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
deserializer.deserialize_str(U32FromStringVisitor)
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user